cardiffnlp/twitter-roberta-base-sentiment is a forked repo from huggingface. License: None
Go to file
Patrick von Platen c8c5458081 upload flax model 2021-05-20 15:06:21 +00:00
.ipynb_checkpoints Adding tweeteval classifier 2020-11-12 17:31:20 +00:00
.gitattributes allow flax 2021-05-20 15:06:02 +00:00
README.md Update README.md 2020-11-13 11:23:30 +00:00
config.json Adding tweeteval classifier 2020-11-12 17:31:20 +00:00
flax_model.msgpack upload flax model 2021-05-20 15:06:21 +00:00
merges.txt Adding tweeteval classifier 2020-11-12 17:31:20 +00:00
pytorch_model.bin Adding tweeteval classifier 2020-11-12 17:31:20 +00:00
special_tokens_map.json Adding tweeteval classifier 2020-11-12 17:31:20 +00:00
tf_model.h5 Adding tweeteval classifier 2020-11-12 17:31:20 +00:00
vocab.json Adding tweeteval classifier 2020-11-12 17:31:20 +00:00

README.md

Twitter-roBERTa-base for Sentiment Analysis

This is a RoBERTa-base model trained on ~58M tweets and fine-tuned for sentiment analysis with the TweetEval benchmark.

Example of classification

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with generic placeholders.

    The text is split on single spaces. A token that starts with ``@`` and
    is longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. All other tokens are kept unchanged, and the
    tokens are joined back together with single spaces.
    """
    def _mask(token):
        # Mention check first, then link check, in the same order for each token.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# Select which task's checkpoint to load; the task name is substituted into
# the model identifier below.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer loaded from the same identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping for the selected task.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Rows are tab-separated; the second column is taken as the label name.
# Rows with fewer than two columns (e.g. a trailing blank line) are skipped.
labels = []
for row in csv.reader(mapping_lines, delimiter='\t'):
    if len(row) > 1:
        labels.append(row[1])

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes to a local directory whose path is the same
# string as the hub identifier. On a later run from_pretrained(MODEL) will
# resolve to that directory, so the tokenizer files must be stored there too;
# otherwise loading the tokenizer from MODEL fails once the directory exists.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First element of the model output, first (only) item of the batch,
# converted to a NumPy array and normalised with softmax.
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# TF alternative (uncomment the lines below to run with TensorFlow)
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

# Class indices ordered from highest to lowest score.
ranking = np.argsort(scores)[::-1]
# Print one line per class: 1-based rank, label name, score rounded to 4 decimals.
for position, class_idx in enumerate(ranking, start=1):
    label = labels[class_idx]
    score = scores[class_idx]
    print(f"{position}) {label} {np.round(float(score), 4)}")

Output:

1) positive 0.8466
2) neutral 0.1458
3) negative 0.0076