Adding tweeteval classifier

2020-11-12 17:31:20 +00:00 · 2020-11-12 17:31:20 +00:00 · c3f00119f3
parent c6eb303fb1
commit c3f00119f3
8 changed files with 50182 additions and 0 deletions
--- a/.ipynb_checkpoints/README-checkpoint.md
+++ b/.ipynb_checkpoints/README-checkpoint.md
@ -0,0 +1,70 @@
 # Twitter-roBERTa-base
 This is a roBERTa-base model trained on ~58M tweets and fine-tuned for the Sentiment Analysis task at SemEval 2018.
 For full description: [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). 
 To evaluate this and other models on Twitter-specific data, please refer to the [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
 ## Example of classification
 ```python
 from transformers import AutoModelForSequenceClassification
 from transformers import TFAutoModelForSequenceClassification
 from transformers import AutoTokenizer
 import numpy as np
 from scipy.special import softmax
 import csv
 import urllib.request
 # Tasks:
 # emoji, emotion, hate, irony, offensive, sentiment
 # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
 task='sentiment'
 MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 # download label mapping (tab-separated rows; the label name is read from the second column)
 mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
 with urllib.request.urlopen(mapping_link) as f:
    mapping_lines = f.read().decode('utf-8').splitlines()
 # Skip blank or short rows instead of blindly dropping the last line, so the
 # final label is kept even when the file does not end with a newline.
 labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]
 # PT
 model = AutoModelForSequenceClassification.from_pretrained(MODEL)
 # Cache a local copy. save_pretrained() creates a directory whose path equals
 # the hub id, and from_pretrained() resolves an existing local directory first,
 # so the tokenizer has to be saved there as well; otherwise the next run finds
 # the directory but no tokenizer files in it.
 model.save_pretrained(MODEL)
 tokenizer.save_pretrained(MODEL)
 text = "Good night 😊"
 encoded_input = tokenizer(text, return_tensors='pt')
 output = model(**encoded_input)
 # first output element, first (only) item of the batch; softmax turns the raw scores into probabilities
 scores = output[0][0].detach().numpy()
 scores = softmax(scores)
 # # TF
 # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
 # model.save_pretrained(MODEL)
 # tokenizer.save_pretrained(MODEL)
 # text = "Good night 😊"
 # encoded_input = tokenizer(text, return_tensors='tf')
 # output = model(encoded_input)
 # scores = output[0][0].numpy()
 # scores = softmax(scores)
 # class indices ordered from highest to lowest score
 ranking = np.argsort(scores)[::-1]
 for position, class_idx in enumerate(ranking[:scores.shape[0]], start=1):
    label = labels[class_idx]
    score = scores[class_idx]
    print(f"{position}) {label} {np.round(float(score), 4)}")
 ```
 Output: 
 ```
 1) positive 0.8466
 2) neutral 0.1458
 3) negative 0.0076
 ```
--- a/README.md
+++ b/README.md
@ -0,0 +1,70 @@
 # Twitter-roBERTa-base
 This is a roBERTa-base model trained on ~58M tweets and fine-tuned for the Sentiment Analysis task at SemEval 2018.
 For full description: [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). 
 To evaluate this and other models on Twitter-specific data, please refer to the [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
 ## Example of classification
 ```python
 from transformers import AutoModelForSequenceClassification
 from transformers import TFAutoModelForSequenceClassification
 from transformers import AutoTokenizer
 import numpy as np
 from scipy.special import softmax
 import csv
 import urllib.request
 # Tasks:
 # emoji, emotion, hate, irony, offensive, sentiment
 # stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
 task='sentiment'
 MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 # download label mapping (tab-separated rows; the label name is read from the second column)
 mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
 with urllib.request.urlopen(mapping_link) as f:
    mapping_lines = f.read().decode('utf-8').splitlines()
 # Skip blank or short rows instead of blindly dropping the last line, so the
 # final label is kept even when the file does not end with a newline.
 labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]
 # PT
 model = AutoModelForSequenceClassification.from_pretrained(MODEL)
 # Cache a local copy. save_pretrained() creates a directory whose path equals
 # the hub id, and from_pretrained() resolves an existing local directory first,
 # so the tokenizer has to be saved there as well; otherwise the next run finds
 # the directory but no tokenizer files in it.
 model.save_pretrained(MODEL)
 tokenizer.save_pretrained(MODEL)
 text = "Good night 😊"
 encoded_input = tokenizer(text, return_tensors='pt')
 output = model(**encoded_input)
 # first output element, first (only) item of the batch; softmax turns the raw scores into probabilities
 scores = output[0][0].detach().numpy()
 scores = softmax(scores)
 # # TF
 # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
 # model.save_pretrained(MODEL)
 # tokenizer.save_pretrained(MODEL)
 # text = "Good night 😊"
 # encoded_input = tokenizer(text, return_tensors='tf')
 # output = model(encoded_input)
 # scores = output[0][0].numpy()
 # scores = softmax(scores)
 # class indices ordered from highest to lowest score
 ranking = np.argsort(scores)[::-1]
 for position, class_idx in enumerate(ranking[:scores.shape[0]], start=1):
    label = labels[class_idx]
    score = scores[class_idx]
    print(f"{position}) {label} {np.round(float(score), 4)}")
 ```
 Output: 
 ```
 1) positive 0.8466
 2) neutral 0.1458
 3) negative 0.0076
 ```
--- a/config.json
+++ b/config.json
@ -0,0 +1,33 @@
 {
  "_name_or_path": "tweeteval_new/roberta-base-rt-sentiment/",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
 }
--- a/merges.txt
+++ b/merges.txt
--- a/pytorch_model.bin
+++ b/pytorch_model.bin
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@ -0,0 +1 @@
 {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
--- a/tf_model.h5
+++ b/tf_model.h5
--- a/vocab.json
+++ b/vocab.json
		`@ -0,0 +1 @@`
							`{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}`