From 09c90226c4c84a46a43f97071bf944f749096861 Mon Sep 17 00:00:00 2001
From: An Yang
Date: Thu, 1 Dec 2022 03:24:24 +0000
Subject: [PATCH] Update README.md

---
 README.md | 50 ++++++++++++++++++++++----------------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index cb28c77..96398a2 100644
--- a/README.md
+++ b/README.md
@@ -9,42 +9,36 @@ license: apache-2.0
 This is the base-version of the Chinese CLIP, with ViT-B/16 as the image encoder and RoBERTa-wwm-base as the text encoder. Chinese CLIP is a simple implementation of CLIP on a large-scale dataset of around 200 million Chinese image-text pairs. For more details, please refer to our technical report https://arxiv.org/abs/2211.01335 and our official github repo https://github.com/OFA-Sys/Chinese-CLIP
 
 ## Use with the official API
-We provide a simple code snippet to show how to use the API for Chinese-CLIP. For starters, please install cn_clip:
-```bash
-# to install the latest stable release
-pip install cn_clip
+We provide a simple code snippet below showing how to use the Chinese-CLIP API to compute image and text embeddings and image-text similarities.
 
-# or install from source code
-cd Chinese-CLIP
-pip install -e .
-```
-After installation, use Chinese CLIP as shown below:
 ```python
-import torch
 from PIL import Image
+import requests
+from transformers import ChineseCLIPProcessor, ChineseCLIPModel
 
-import cn_clip.clip as clip
-from cn_clip.clip import load_from_name, available_models
-print("Available models:", available_models())
-# Available models: ['ViT-B-16', 'ViT-L-14', 'ViT-L-14-336', 'ViT-H-14', 'RN50']
+model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
+processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model, preprocess = load_from_name("ViT-B-16", device=device, download_root='./')
-model.eval()
-image = preprocess(Image.open("examples/pokemon.jpeg")).unsqueeze(0).to(device)
-text = clip.tokenize(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]).to(device)
+url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
+image = Image.open(requests.get(url, stream=True).raw)
+# Squirtle, Bulbasaur, Charmander, Pikachu in English
+texts = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]
 
-with torch.no_grad():
-    image_features = model.encode_image(image)
-    text_features = model.encode_text(text)
-    # Normalize the features. Please use the normalized features for downstream tasks.
-    image_features /= image_features.norm(dim=-1, keepdim=True)
-    text_features /= text_features.norm(dim=-1, keepdim=True)
+# compute image features
+inputs = processor(images=image, return_tensors="pt")
+image_features = model.get_image_features(**inputs)
+image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)  # normalize
 
-    logits_per_image, logits_per_text = model.get_similarity(image, text)
-    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+# compute text features
+inputs = processor(text=texts, padding=True, return_tensors="pt")
+text_features = model.get_text_features(**inputs)
+text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)  # normalize
 
-print("Label probs:", probs)  # [[1.268734e-03 5.436878e-02 6.795761e-04 9.436829e-01]]
+# compute image-text similarity scores
+inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
+outputs = model(**inputs)
+logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+probs = logits_per_image.softmax(dim=1)  # probs: [[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]]
 ```
 
 However, if you are not satisfied with only using the API, feel free to check our github repo https://github.com/OFA-Sys/Chinese-CLIP for more details about training and inference.
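As a follow-up to the snippet in the patched README above, here is a minimal, self-contained sketch that ranks the candidate captions and prints the best match for the example image. It reuses the same checkpoint, image URL, and caption list; the `argmax` readout at the end is an illustrative addition and not part of the commit above.

```python
import requests
import torch
from PIL import Image
from transformers import ChineseCLIPModel, ChineseCLIPProcessor

# Same checkpoint, image URL, and candidate captions as in the README snippet above.
model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")

url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
texts = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]  # Squirtle, Bulbasaur, Charmander, Pikachu

with torch.no_grad():  # inference only, no gradients needed
    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    probs = model(**inputs).logits_per_image.softmax(dim=1)  # shape (1, 4)

best = probs.argmax(dim=1).item()  # index of the highest-probability caption
print(f"Best match: {texts[best]} (p={probs[0, best].item():.4f})")
# With the example image, this should print the Pikachu caption (皮卡丘).
```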