Adding tweeteval classifier

2020-11-12 17:31:20 +00:00 · 2020-11-12 17:31:20 +00:00 · c3f00119f3
parent c6eb303fb1
commit c3f00119f3
8 changed files with 50182 additions and 0 deletions
--- a/.ipynb_checkpoints/README-checkpoint.md
+++ b/.ipynb_checkpoints/README-checkpoint.md
@ -0,0 +1,70 @@
+# Twitter-roBERTa-base
+
+This is a roBERTa-base model trained on ~58M tweets and fine-tuned for the Sentiment Analysis task at SemEval 2018. 
+For a full description, see the [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). 
+To evaluate this and other models on Twitter-specific data, please refer to the [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
+
+## Example of classification
+
+```python
+from transformers import AutoModelForSequenceClassification
+from transformers import TFAutoModelForSequenceClassification
+from transformers import AutoTokenizer
+import numpy as np
+from scipy.special import softmax
+import csv
+import urllib.request
+
+# Tasks:
+# emoji, emotion, hate, irony, offensive, sentiment
+# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
+
+task='sentiment'
+MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+# download label mapping
+labels=[]
+mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
+with urllib.request.urlopen(mapping_link) as f:
+    html = f.read().decode('utf-8').split("\n")
+    spamreader = csv.reader(html[:-1], delimiter='\t')
+labels = [row[1] for row in spamreader]
+
+# PT
+model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+model.save_pretrained(MODEL)
+
+text = "Good night 😊"
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+scores = output[0][0].detach().numpy()
+scores = softmax(scores)
+
+# # TF
+# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
+# model.save_pretrained(MODEL)
+
+# text = "Good night 😊"
+# encoded_input = tokenizer(text, return_tensors='tf')
+# output = model(encoded_input)
+# scores = output[0][0].numpy()
+# scores = softmax(scores)
+
+ranking = np.argsort(scores)
+ranking = ranking[::-1]
+for i in range(scores.shape[0]):
+    l = labels[ranking[i]]
+    s = scores[ranking[i]]
+    print(f"{i+1}) {l} {np.round(float(s), 4)}")
+
+```
+
+Output: 
+
+```
+1) positive 0.8466
+2) neutral 0.1458
+3) negative 0.0076
+```
--- a/README.md
+++ b/README.md
@ -0,0 +1,70 @@
+# Twitter-roBERTa-base
+
+This is a roBERTa-base model trained on ~58M tweets and fine-tuned for the Sentiment Analysis task at SemEval 2018. 
+For a full description, see the [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). 
+To evaluate this and other models on Twitter-specific data, please refer to the [TweetEval official repository](https://github.com/cardiffnlp/tweeteval).
+
+## Example of classification
+
+```python
+from transformers import AutoModelForSequenceClassification
+from transformers import TFAutoModelForSequenceClassification
+from transformers import AutoTokenizer
+import numpy as np
+from scipy.special import softmax
+import csv
+import urllib.request
+
+# Tasks:
+# emoji, emotion, hate, irony, offensive, sentiment
+# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary
+
+task='sentiment'
+MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
+
+# download label mapping
+labels=[]
+mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
+with urllib.request.urlopen(mapping_link) as f:
+    html = f.read().decode('utf-8').split("\n")
+    spamreader = csv.reader(html[:-1], delimiter='\t')
+labels = [row[1] for row in spamreader]
+
+# PT
+model = AutoModelForSequenceClassification.from_pretrained(MODEL)
+model.save_pretrained(MODEL)
+
+text = "Good night 😊"
+encoded_input = tokenizer(text, return_tensors='pt')
+output = model(**encoded_input)
+scores = output[0][0].detach().numpy()
+scores = softmax(scores)
+
+# # TF
+# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
+# model.save_pretrained(MODEL)
+
+# text = "Good night 😊"
+# encoded_input = tokenizer(text, return_tensors='tf')
+# output = model(encoded_input)
+# scores = output[0][0].numpy()
+# scores = softmax(scores)
+
+ranking = np.argsort(scores)
+ranking = ranking[::-1]
+for i in range(scores.shape[0]):
+    l = labels[ranking[i]]
+    s = scores[ranking[i]]
+    print(f"{i+1}) {l} {np.round(float(s), 4)}")
+
+```
+
+Output: 
+
+```
+1) positive 0.8466
+2) neutral 0.1458
+3) negative 0.0076
+```
--- a/config.json
+++ b/config.json
@ -0,0 +1,33 @@
+{
+  "_name_or_path": "tweeteval_new/roberta-base-rt-sentiment/",
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "type_vocab_size": 1,
+  "vocab_size": 50265
+}
--- a/merges.txt
+++ b/merges.txt
--- a/pytorch_model.bin
+++ b/pytorch_model.bin
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@ -0,0 +1 @@
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
--- a/tf_model.h5
+++ b/tf_model.h5
--- a/vocab.json
+++ b/vocab.json
@ -0,0 +1 @@
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}