Update README.md

Authored by Cardiff NLP on 2021-04-17 06:25:37 +00:00; committed by huggingface-web
parent 1f8684fe27
commit 7a0cada18e
1 changed file with 29 additions and 19 deletions


---
language: multilingual
widget:
- text: "T'estimo!"
- text: "I love you!"
- text: "I hate you"
- text: "Mahal kita!"
- text: "사랑해!"
- text: "난 너가 싫어"
---
# twitter-XLM-roBERTa-base for Sentiment Analysis

This is an XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis in multiple languages.
- Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://...).
- Git Repo: [Tweeteval official repository](https://github.com/cardiffnlp/xlm-t).
## Example Pipeline
```python
from transformers import pipeline
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
sentiment_task("T'estimo!")
```
## Full classification example
```python
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"/home/jupyter/misc/tweeteval/TweetEval_models/xlm-twitter/twitter-xlm-roberta-base-sentiment"
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
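# config.id2label maps class indices to label names; for this checkpoint the
# labels are Negative / Neutral / Positive (see the example output below)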
# PT (PyTorch)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Run the model on one (preprocessed) example
text = "T'estimo!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# TF (equivalent TensorFlow path, commented out)
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)
# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
```

Output:

```
1) Positive 0.7673
2) Neutral 0.2015
3) Negative 0.0313
```
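For convenience, the steps above can be folded into a single helper. This is a minimal sketch, not part of the model card: the `predict_sentiment` name is ours, and it assumes the `tokenizer`, `config`, `model`, and `preprocess` objects defined in the full example.

```python
# Minimal sketch (assumes tokenizer, config, model and preprocess from above)
def predict_sentiment(text):
    """Return a {label: probability} dict for a single tweet."""
    encoded_input = tokenizer(preprocess(text), return_tensors='pt')
    scores = softmax(model(**encoded_input)[0][0].detach().numpy())
    return {config.id2label[i]: float(s) for i, s in enumerate(scores)}

print(predict_sentiment("T'estimo!"))
# e.g. {'Negative': 0.03, 'Neutral': 0.20, 'Positive': 0.77} (scores will vary)
```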