Compare commits

..

No commits in common. "e6a30b603a447e251fdaca1c3056b2a16cdfebeb" and "5975e4222c57473e924577a3b08b38377c3a06bd" have entirely different histories.

5 changed files with 12 additions and 98422 deletions

View File

@@ -1,16 +1,5 @@
---
tags:
- vision
widget:
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
candidate_labels: playing music, playing sports
example_title: Cat & Dog
---
# Model Card: CLIP
Disclaimer: This model card is taken and modified from the official CLIP repository; the original can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md).
## Model Details
The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment; to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they're being deployed within.
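The zero-shot behaviour described above can be exercised directly through the `transformers` library. The following is a minimal sketch: `CLIPModel` and `CLIPProcessor` are the standard transformers classes for this architecture, while the checkpoint name, sample image URL, and candidate labels are taken from the `_name_or_path` field in the config below and the widget metadata at the top of the card.

```python
# Minimal zero-shot classification sketch for this checkpoint.
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sample image and candidate labels mirror the widget metadata in the card header.
url = "https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["playing music", "playing sports"],
    images=image,
    return_tensors="pt",
    padding=True,
)
outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)  # probability per candidate label
```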
@@ -21,10 +10,15 @@ January 2021
### Model Type
The model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss.
The base model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer.
The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer.
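The contrastive loss mentioned above amounts to scoring image and text embeddings in a shared projection space. A rough sketch of that similarity computation, reusing `model`, `processor`, and `image` from the previous snippet and the `get_image_features`/`get_text_features` methods and `logit_scale` parameter of transformers' `CLIPModel`:

```python
# Sketch of the similarity score behind the contrastive objective.
import torch

text_inputs = processor(text=["playing music", "playing sports"], return_tensors="pt", padding=True)
image_inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    text_emb = model.get_text_features(**text_inputs)     # shape (2, 512): projection_dim per config.json below
    image_emb = model.get_image_features(**image_inputs)  # shape (1, 512)

# Cosine similarity of L2-normalized embeddings, scaled by the learned temperature.
text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
logits = model.logit_scale.exp() * image_emb @ text_emb.t()
```

Training maximizes these logits for matching (image, text) pairs and minimizes them for mismatched pairs within a batch.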
### Model Version
Initially, we've released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, which uses an architecture equivalent to ResNet-50.
*This port does not include the ResNet model.*
Please see the paper linked below for further details about their specification.
### Documents

View File

@ -1,10 +1,8 @@
{
"_name_or_path": "openai/clip-vit-base-patch32",
"architectures": [
"CLIPModel"
],
"initializer_factor": 1.0,
"logit_scale_init_value": 2.6592,
"model_type": "clip",
"projection_dim": 512,
"text_config": {
@@ -15,7 +13,6 @@
"bad_words_ids": null,
"bos_token_id": 0,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
@@ -26,6 +23,7 @@
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"gradient_checkpointing": false,
"hidden_act": "quick_gelu",
"hidden_size": 512,
"id2label": {
@@ -58,7 +56,6 @@
"output_scores": false,
"pad_token_id": 1,
"prefix": null,
"projection_dim": 512,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
@@ -73,9 +70,8 @@
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.16.0.dev0",
"transformers_version": "4.6.0.dev0",
"use_bfloat16": false,
"vocab_size": 49408
},
@@ -89,7 +85,6 @@
"bad_words_ids": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
@@ -100,6 +95,7 @@
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"gradient_checkpointing": false,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"id2label": {
@@ -133,7 +129,6 @@
"pad_token_id": null,
"patch_size": 32,
"prefix": null,
"projection_dim" : 512,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
@@ -148,9 +143,8 @@
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.16.0.dev0",
"transformers_version": "4.6.0.dev0",
"use_bfloat16": false
},
"vision_config_dict": null

BIN
flax_model.msgpack (Stored with Git LFS)

Binary file not shown.

BIN
tf_model.h5 (Stored with Git LFS)

Binary file not shown.
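The two LFS binaries above are the Flax and TensorFlow checkpoints of the same weights. If the repository ships them, each is picked up by the corresponding transformers class; a one-line sketch per framework, assuming `flax` and `tensorflow` are installed alongside `transformers`:

```python
# Sketch: flax_model.msgpack and tf_model.h5 back these framework-specific classes.
from transformers import FlaxCLIPModel, TFCLIPModel

flax_model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")  # resolves flax_model.msgpack
tf_model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")      # resolves tf_model.h5
```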

File diff suppressed because one or more lines are too long