diff --git a/README.md b/README.md index 0c43008..985c51d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Twitter-roBERTa-base +# Twitter-roBERTa-base for Sentiment Analysis This is a roBERTa-base model trained on ~58M tweets and finetuned for the Sentiment Analysis task at Semeval 2018. For full description: [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). @@ -6,6 +6,15 @@ To evaluate this and other models on Twitter-specific data, please refer to the ## Example of classification ```python +# Preprocess text (username and link placeholders) +def preprocess(text): + new_text = [] + for t in text.split(" "): + t = '@user' if t.startswith('@') and len(t) > 1 else t + t = 'http' if t.startswith('http') else t + new_text.append(t) + return " ".join(new_text) + from transformers import AutoModelForSequenceClassification from transformers import TFAutoModelForSequenceClassification @@ -37,6 +46,7 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL) model.save_pretrained(MODEL) text = "Good night 😊" +text = preprocess(text) encoded_input = tokenizer(text, return_tensors='pt') output = model(**encoded_input) scores = output[0][0].detach().numpy()