Compare commits

...

10 Commits

Author SHA1 Message Date
luyongjian 39e233cfd0 Update README.md 2022-10-22 07:21:15 +00:00
luyongjian a3aa741c87 Update README.md
add github link
2022-10-22 06:26:29 +00:00
luyongjian e6d8f2ff4d Update README.md
update readme
2022-09-19 03:57:05 +00:00
luyongjian 5312b0749f Upload README.md 2022-07-16 02:43:10 +00:00
luyongjian 670752485b Upload config.json 2022-07-16 02:43:02 +00:00
luyongjian 9dfabb0e56 Upload preprocessor_config.json 2022-07-16 02:42:51 +00:00
luyongjian 5426ff5d89 Upload special_tokens_map.json 2022-07-16 02:42:41 +00:00
luyongjian 15a0a521e6 Upload tokenizer_config.json 2022-07-16 02:42:32 +00:00
luyongjian dc4b5a73f0 Upload vocab.json 2022-07-16 02:42:19 +00:00
luyongjian 882787221f pytorch model upload 2022-07-16 02:19:56 +00:00
7 changed files with 217 additions and 0 deletions

48
README.md Normal file
View File

@ -0,0 +1,48 @@
---
language: en
datasets:
- LIUM/tedlium
tags:
- speech
- audio
- automatic-speech-recognition
---
Finetuned from [facebook/wav2vec2-large-960h-lv60-self](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
# Installation
1. PyTorch installation: https://pytorch.org/
2. Install transformers: https://huggingface.co/docs/transformers/installation
For example, to install with conda:
```
>> conda create -n wav2vec2 python=3.8
>> conda activate wav2vec2
>> conda install pytorch cudatoolkit=11.3 -c pytorch
>> conda install -c conda-forge transformers
```
# Usage
```python
# Minimal end-to-end example: load the fine-tuned CTC model, run it on a
# waveform, and print the greedy (argmax) transcription.

# Load the model and processor
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import numpy as np
import torch

model = Wav2Vec2ForCTC.from_pretrained(r'yongjian/wav2vec2-large-a')  # Note: PyTorch Model
processor = Wav2Vec2Processor.from_pretrained(r'yongjian/wav2vec2-large-a')

# Load input
# Placeholder: 16000 samples of random noise clipped to [-1, 1].
np_wav = np.random.normal(size=(16000)).clip(-1, 1)  # change it to your sample

# Inference
# Use the sampling rate the feature extractor is configured with.
sample_rate = processor.feature_extractor.sampling_rate
with torch.no_grad():  # inference only, no gradients needed
    model_inputs = processor(np_wav, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    logits = model(model_inputs.input_values, attention_mask=model_inputs.attention_mask).logits  # use .cuda() for GPU acceleration
    # Pick the highest-scoring vocabulary id at every output frame.
    pred_ids = torch.argmax(logits, dim=-1).cpu()
    # batch_decode turns the predicted id sequences into text.
    pred_text = processor.batch_decode(pred_ids)
print('Transcription:', pred_text)
```
# Code
GitHub Repo:
https://github.com/CassiniHuy/wav2vec2_finetune

108
config.json Normal file
View File

@ -0,0 +1,108 @@
{
"activation_dropout": 0.1,
"adapter_kernel_size": 3,
"adapter_stride": 2,
"add_adapter": false,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"classifier_proj_size": 256,
"codevector_dim": 256,
"contrastive_logits_temperature": 0.1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"diversity_loss_weight": 0.1,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"feat_quantizer_dropout": 0.0,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_min_masks": 0,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_masks": 2,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_adapter_layers": 3,
"num_attention_heads": 16,
"num_codevector_groups": 2,
"num_codevectors_per_group": 320,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"num_negatives": 100,
"output_hidden_size": 1024,
"pad_token_id": 0,
"proj_codevector_dim": 256,
"tdnn_dilation": [
1,
2,
3,
1,
1
],
"tdnn_dim": [
512,
512,
512,
512,
1500
],
"tdnn_kernel": [
5,
3,
3,
1,
1
],
"torch_dtype": "float32",
"transformers_version": "4.19.4",
"use_weighted_layer_sum": false,
"vocab_size": 32,
"xvector_output_dim": 512
}

10
preprocessor_config.json Normal file
View File

@ -0,0 +1,10 @@
{
"do_normalize": true,
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
"feature_size": 1,
"padding_side": "right",
"padding_value": 0.0,
"processor_class": "Wav2Vec2Processor",
"return_attention_mask": true,
"sampling_rate": 16000
}

BIN
pytorch_model.bin (Stored with Git LFS) Normal file

Binary file not shown.

1
special_tokens_map.json Normal file
View File

@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

13
tokenizer_config.json Normal file
View File

@ -0,0 +1,13 @@
{
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>",
"do_lower_case": false,
"word_delimiter_token": "|",
"replace_word_delimiter_char": " ",
"return_attention_mask": true,
"do_normalize": true,
"processor_class": "Wav2Vec2Processor",
"tokenizer_class": "Wav2Vec2CTCTokenizer"
}

34
vocab.json Normal file
View File

@ -0,0 +1,34 @@
{
"<pad>": 0,
"<s>": 1,
"</s>": 2,
"<unk>": 3,
"|": 4,
"E": 5,
"T": 6,
"A": 7,
"O": 8,
"N": 9,
"I": 10,
"H": 11,
"S": 12,
"R": 13,
"D": 14,
"L": 15,
"U": 16,
"M": 17,
"W": 18,
"C": 19,
"F": 20,
"G": 21,
"Y": 22,
"P": 23,
"B": 24,
"V": 25,
"K": 26,
"'": 27,
"X": 28,
"J": 29,
"Q": 30,
"Z": 31
}