Compare commits: 5975e4222c ... e6a30b603a (10 commits)
Author | SHA1 | Date
---|---|---
 | e6a30b603a |
 | f330785f9a |
 | f4881ba48e |
 | 8759caf01b |
 | 27464f12df |
 | 7acb5c9a46 |
 | 4b75373442 |
 | 1b010d83b8 |
 | 35754e5fa9 |
 | 8092f5b35a |
README.md (20 changed lines)

@@ -1,5 +1,16 @@
---
tags:
- vision
widget:
- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
  candidate_labels: playing music, playing sports
  example_title: Cat & Dog
---

# Model Card: CLIP

Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md).

## Model Details

The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within.

@@ -10,15 +21,10 @@ January 2021

### Model Type

The base model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer.
The model uses a ViT-B/32 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss.

### Model Version
The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer.

Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50.

*This port does not include the ResNet model.*

Please see the paper linked below for further details about their specification.

### Documents

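The `widget` entry added to the front matter is what drives the hosted zero-shot image-classification demo on the model page. Below is a minimal sketch of the same call made locally, assuming the transformers CLIPModel / CLIPProcessor API and reusing the sample image URL and candidate labels from the diff above.

```python
# Zero-shot classification with the candidate labels from the widget entry.
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png"
image = Image.open(requests.get(url, stream=True).raw)

labels = ["playing music", "playing sports"]
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# logits_per_image holds the scaled image-text similarities learned via the contrastive loss;
# a softmax over the candidate labels turns them into zero-shot class probabilities.
probs = outputs.logits_per_image.softmax(dim=1)
print(dict(zip(labels, probs[0].tolist())))
```

This is the same pairing described under Model Type: the text encoder embeds each candidate label, the ViT-B/32 image encoder embeds the image, and the logits are their temperature-scaled similarities.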
config.json (14 changed lines)

@@ -1,8 +1,10 @@
{
  "_name_or_path": "openai/clip-vit-base-patch32",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 512,
  "text_config": {

@@ -13,6 +15,7 @@
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,

@@ -23,7 +26,6 @@
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "gradient_checkpointing": false,
    "hidden_act": "quick_gelu",
    "hidden_size": 512,
    "id2label": {

@@ -56,6 +58,7 @@
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "projection_dim": 512,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,

@@ -70,8 +73,9 @@
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.6.0.dev0",
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false,
    "vocab_size": 49408
  },

@@ -85,6 +89,7 @@
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,

@@ -95,7 +100,6 @@
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "gradient_checkpointing": false,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {

@@ -129,6 +133,7 @@
    "pad_token_id": null,
    "patch_size": 32,
    "prefix": null,
    "projection_dim" : 512,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,

@@ -143,8 +148,9 @@
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.6.0.dev0",
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false
  },
  "vision_config_dict": null

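The config changes touch both sub-configurations (the text encoder and the ViT-B/32 vision encoder), and the transformers_version field moves between 4.6.0.dev0 and 4.16.0.dev0. Below is a short sketch for inspecting the fields that appear in this diff, assuming the transformers CLIPConfig API with its text_config / vision_config sub-configs.

```python
# Sketch: load this config and check the fields the config.json diff touches.
from transformers import CLIPConfig

config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32")

print(config.projection_dim)             # 512   - shared image/text embedding dimension
print(config.text_config.hidden_size)    # 512   - text encoder width
print(config.text_config.vocab_size)     # 49408 - BPE vocabulary size
print(config.vision_config.hidden_size)  # 768   - ViT-B/32 encoder width
print(config.vision_config.patch_size)   # 32    - the "/32" in ViT-B/32
```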
Binary file not shown.
Binary file not shown.
tokenizer.json (98394 changed lines)

File diff suppressed because one or more lines are too long.
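The suppressed tokenizer.json is the serialized fast BPE tokenizer. A small sketch, assuming the transformers CLIPTokenizerFast API, showing how it relates to the vocab_size recorded in config.json and to the candidate labels used by the widget:

```python
# Sketch: load the fast tokenizer backed by tokenizer.json.
from transformers import CLIPTokenizerFast

tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")

enc = tokenizer(["playing music", "playing sports"], padding=True)
print(enc["input_ids"])      # BPE ids, wrapped in start/end-of-text special tokens
print(tokenizer.vocab_size)  # 49408, matching "vocab_size" in config.json
```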