From 7a0cada18ec705f13cdcf5fc709fae9c9906c811 Mon Sep 17 00:00:00 2001
From: Cardiff NLP
Date: Sat, 17 Apr 2021 06:25:37 +0000
Subject: [PATCH] Update README.md

---
 README.md | 48 +++++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index a79e3a0..c6c386c 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,15 @@
+---
+language: multilingual
+widget:
+- text: "T'estimo!"
+- text: "I love you!"
+- text: "I hate you"
+- text: "Mahal kita!"
+- text: "사랑해!"
+- text: "난 너가 싫어"
+---
+
+
 # twitter-XLM-roBERTa-base for Sentiment Analysis

 This is a XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis in
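The `widget` strings in the front matter above double as a quick smoke test. A minimal sketch (not part of the diff, assuming only that `transformers` plus a backend such as PyTorch are installed) that scores all of them with the same `pipeline` call this patch adds in the next hunk:

```python
# Illustrative sketch only -- mirrors the "Example Pipeline" hunk added below.
from transformers import pipeline

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

# The same multilingual strings declared under `widget:` in the front matter
widget_texts = [
    "T'estimo!",    # Catalan
    "I love you!",
    "I hate you",
    "Mahal kita!",  # Tagalog
    "사랑해!",        # Korean
    "난 너가 싫어",    # Korean
]

# A text-classification pipeline accepts a list and returns one
# {"label": ..., "score": ...} dict per input text.
for text, result in zip(widget_texts, sentiment_task(widget_texts)):
    print(f"{text!r}: {result['label']} ({result['score']:.4f})")
```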