diff --git a/README.md b/README.md index ce919be..a79e3a0 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,9 @@ # twitter-XLM-roBERTa-base for Sentiment Analysis +This is an XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis. - -TODO: create model card - - - - - - - -This is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark. - -- Paper: [_TweetEval_ benchmark (Findings of EMNLP 2020)](https://arxiv.org/pdf/2010.12421.pdf). -- Git Repo: [Tweeteval official repository](https://github.com/cardiffnlp/tweeteval). +- Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://...). +- Git Repo: [XLM-T official repository](https://github.com/cardiffnlp/xlm-t). ## Example of classification @@ -37,22 +27,17 @@ def preprocess(text): new_text.append(t) return " ".join(new_text) -# Tasks: -# emoji, emotion, hate, irony, offensive, sentiment -# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary - -task='sentiment' -MODEL = f"cardiffnlp/twitter-roberta-base-{task}" +MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment" tokenizer = AutoTokenizer.from_pretrained(MODEL) # download label mapping labels=[] -mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt" +mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt" with urllib.request.urlopen(mapping_link) as f: - html = f.read().decode('utf-8').split("\ + html = f.read().decode('utf-8').split("\\ ") - csvreader = csv.reader(html, delimiter='\\t') + csvreader = csv.reader(html, delimiter='\\\\t') labels = [row[1] for row in csvreader if len(row) > 1] # PT @@ -88,8 +73,8 @@ for i in range(scores.shape[0]): Output: ``` -1) positive 0.8466 -2) neutral 0.1458 -3) negative 0.0076 +1) positive 0.76726073 +2) neutral 0.201 +3) negative 0.0312 ```