Compare commits: 5975e4222c ... e6a30b603a (10 commits)
Author | SHA1 | Date
---|---|---
 | e6a30b603a |
 | f330785f9a |
 | f4881ba48e |
 | 8759caf01b |
 | 27464f12df |
 | 7acb5c9a46 |
 | 4b75373442 |
 | 1b010d83b8 |
 | 35754e5fa9 |
 | 8092f5b35a |
README.md (20 changed lines)

@@ -1,5 +1,16 @@
---
tags:
- vision
widget:
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
  candidate_labels: playing music, playing sports
  example_title: Cat & Dog
---

# Model Card: CLIP

Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md).

## Model Details

The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within.

@@ -10,15 +21,10 @@ January 2021

### Model Type

The base model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer.
The model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss.

### Model Version
The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer.

Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50.

*This port does not include the ResNet model.*

Please see the paper linked below for further details about their specification.

### Documents

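The `widget` entry added to the front matter is what drives the hosted zero-shot image-classification demo on the model page. Below is a minimal sketch of the same call made locally, assuming the transformers CLIPModel / CLIPProcessor API and reusing the sample image URL and candidate labels from the diff above.

```python
# Zero-shot classification with the candidate labels from the widget entry.
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png"
image = Image.open(requests.get(url, stream=True).raw)

labels = ["playing music", "playing sports"]
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# logits_per_image holds the scaled image-text similarities learned via the contrastive loss;
# a softmax over the candidate labels turns them into zero-shot class probabilities.
probs = outputs.logits_per_image.softmax(dim=1)
print(dict(zip(labels, probs[0].tolist())))
```

This is the same pairing described under Model Type: the text encoder embeds each candidate label, the ViT-B/32 image encoder embeds the image, and the logits are their temperature-scaled similarities.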
config.json (14 changed lines)

@@ -1,8 +1,10 @@
{
  "_name_or_path": "openai/clip-vit-base-patch32",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 512,
  "text_config": {

@@ -13,6 +15,7 @@
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,

@@ -23,7 +26,6 @@
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "gradient_checkpointing": false,
    "hidden_act": "quick_gelu",
    "hidden_size": 512,
    "id2label": {

@@ -56,6 +58,7 @@
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "projection_dim": 512,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,

@@ -70,8 +73,9 @@
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.6.0.dev0",
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false,
    "vocab_size": 49408
  },

@@ -85,6 +89,7 @@
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,

@@ -95,7 +100,6 @@
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "gradient_checkpointing": false,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {

@@ -129,6 +133,7 @@
    "pad_token_id": null,
    "patch_size": 32,
    "prefix": null,
    "projection_dim" : 512,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,

@@ -143,8 +148,9 @@
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.6.0.dev0",
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false
  },
  "vision_config_dict": null

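The config changes touch both sub-configurations (the text encoder and the ViT-B/32 vision encoder), and the transformers_version field moves between 4.6.0.dev0 and 4.16.0.dev0. Below is a short sketch for inspecting the fields that appear in this diff, assuming the transformers CLIPConfig API with its text_config / vision_config sub-configs.

```python
# Sketch: load this config and check the fields the config.json diff touches.
from transformers import CLIPConfig

config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")

print(config.projection_dim)             # 512   - shared image/text embedding dimension
print(config.text_config.hidden_size)    # 512   - text encoder width
print(config.text_config.vocab_size)     # 49408 - BPE vocabulary size
print(config.vision_config.hidden_size)  # 768   - ViT-B/32 encoder width
print(config.vision_config.patch_size)   # 32    - the "/32" in ViT-B/32
```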
Binary file not shown.
Binary file not shown.
tokenizer.json (98394 changed lines)

File diff suppressed because one or more lines are too long.
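The suppressed tokenizer.json is the serialized fast BPE tokenizer. A small sketch, assuming the transformers CLIPTokenizerFast API, showing how it relates to the vocab_size recorded in config.json and to the candidate labels used by the widget:

```python
# Sketch: load the fast tokenizer backed by tokenizer.json.
from transformers import CLIPTokenizerFast

tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")

enc = tokenizer(["playing music", "playing sports"], padding=True)
print(enc["input_ids"])      # BPE ids, wrapped in start/end-of-text special tokens
print(tokenizer.vocab_size)  # 49408, matching "vocab_size" in config.json
```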