add files

This commit is contained in:
ydshieh 2021-10-24 17:23:41 +00:00
parent 9d3e38f053
commit c1c837b30e
15 changed files with 150862 additions and 0 deletions

46
README.md Normal file

@@ -0,0 +1,46 @@
---
tags:
- image-classification
library_name: generic
---
## Example
This model is by no means state of the art, but it nevertheless
produces reasonable image-captioning results. It was fine-tuned mainly
as a proof of concept for the 🤗 FlaxVisionEncoderDecoder framework.
The model can be used as follows:
```python
import requests
from PIL import Image
from transformers import ViTFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel
loc = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(loc)
tokenizer = AutoTokenizer.from_pretrained(loc)
model = FlaxVisionEncoderDecoderModel.from_pretrained(loc)
# We will verify our results on an image of cute cats
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
with Image.open(requests.get(url, stream=True).raw) as img:
    pixel_values = feature_extractor(images=img, return_tensors="np").pixel_values

def generate_step(pixel_values):
    output_ids = model.generate(pixel_values, max_length=16, num_beams=4).sequences
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds

preds = generate_step(pixel_values)
print(preds)
# should produce
# ['a cat laying on top of a couch next to another cat']
```

169
config.json Normal file

@@ -0,0 +1,169 @@
{
"architectures": [
"VisionEncoderDecoderModel"
],
"bos_token_id": 50256,
"decoder": {
"_name_or_path": "",
"activation_function": "gelu_new",
"add_cross_attention": true,
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bad_words_ids": null,
"bos_token_id": 50256,
"chunk_size_feed_forward": 0,
"decoder_start_token_id": 50256,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"embd_pdrop": 0.1,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 50256,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_range": 0.02,
"is_decoder": true,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_epsilon": 1e-05,
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "gpt2",
"n_ctx": 1024,
"n_embd": 768,
"n_head": 12,
"n_inner": null,
"n_layer": 12,
"n_positions": 1024,
"no_repeat_ngram_size": 0,
"num_beam_groups": 1,
"num_beams": 1,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 50256,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"resid_pdrop": 0.1,
"return_dict": true,
"return_dict_in_generate": false,
"scale_attn_weights": true,
"sep_token_id": null,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 50
}
},
"temperature": 1.0,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.11.0.dev0",
"use_bfloat16": false,
"use_cache": true,
"vocab_size": 50257
},
"decoder_start_token_id": 50256,
"encoder": {
"_name_or_path": "",
"add_cross_attention": false,
"architectures": [
"ViTModel"
],
"attention_probs_dropout_prob": 0.0,
"bad_words_ids": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_size": 224,
"initializer_range": 0.02,
"intermediate_size": 3072,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-12,
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "vit",
"no_repeat_ngram_size": 0,
"num_attention_heads": 12,
"num_beam_groups": 1,
"num_beams": 1,
"num_channels": 3,
"num_hidden_layers": 12,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_size": 16,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"task_specific_params": null,
"temperature": 1.0,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.11.0.dev0",
"use_bfloat16": false
},
"eos_token_id": 50256,
"is_encoder_decoder": true,
"model_type": "vision-encoder-decoder",
"pad_token_id": 50256,
"transformers_version": null
}
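
The composite config above nests a full ViT encoder config (12 layers, hidden size 768, 224×224 images split into 16×16 patches) and a GPT-2 decoder config with cross-attention enabled under a single `vision-encoder-decoder` model type. A minimal sketch of inspecting it, assuming the config is fetched from the `ydshieh/vit-gpt2-coco-en` Hub repo:

```python
from transformers import VisionEncoderDecoderConfig

# A minimal sketch, assuming network access to the Hub repo.
config = VisionEncoderDecoderConfig.from_pretrained("ydshieh/vit-gpt2-coco-en")
print(config.encoder.model_type)           # "vit"
print(config.decoder.model_type)           # "gpt2"
print(config.decoder.add_cross_attention)  # True: the decoder attends to ViT patch features
print(config.decoder_start_token_id)       # 50256, i.e. <|endoftext|>
```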

BIN
events.out.tfevents.1633443513.t1v-n-bb5dfd23-w-0.8655.0.v2 (Stored with Git LFS) Normal file

Binary file not shown.

BIN
flax_model.msgpack (Stored with Git LFS) Normal file

Binary file not shown.

99330
generation_eval.json Normal file

File diff suppressed because it is too large

50001
merges.txt Normal file

File diff suppressed because it is too large

48
pipeline.py Normal file

@@ -0,0 +1,48 @@
import os
from typing import Dict, List, Any
from PIL import Image
import jax
from transformers import ViTFeatureExtractor, AutoTokenizer, FlaxVisionEncoderDecoderModel


class PreTrainedPipeline():

    def __init__(self, path=""):
        model_dir = os.path.join(path, "ckpt_epoch_3_step_6900")

        self.model = FlaxVisionEncoderDecoderModel.from_pretrained(model_dir)
        self.feature_extractor = ViTFeatureExtractor.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

        max_length = 16
        num_beams = 4
        self.gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

        # jit-compile generation so repeated calls reuse the compiled function
        @jax.jit
        def _generate(pixel_values):
            output_ids = self.model.generate(pixel_values, **self.gen_kwargs).sequences
            return output_ids

        self.generate = _generate

        # warm up (compile) the model on the bundled sample image
        image_path = os.path.join(path, "val_000000039769.jpg")
        image = Image.open(image_path)
        self(image)
        image.close()

    def __call__(self, inputs: "Image.Image") -> List[str]:
        """
        Args:
            inputs (`PIL.Image.Image`): the image to caption.
        Return:
            `List[str]`: the generated caption(s), one per input image.
        """
        pixel_values = self.feature_extractor(images=inputs, return_tensors="np").pixel_values
        output_ids = self.generate(pixel_values)
        preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds = [pred.strip() for pred in preds]
        return preds
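
A hypothetical usage sketch for the class above, run from a local checkout of this repository; it assumes the checkpoint directory `ckpt_epoch_3_step_6900` referenced in `__init__` and the bundled `val_000000039769.jpg` both exist under `path`:

```python
from PIL import Image

from pipeline import PreTrainedPipeline  # the module defined above

# Hypothetical local paths; adjust to wherever the repo files live.
pipe = PreTrainedPipeline(path=".")
with Image.open("val_000000039769.jpg") as img:
    print(pipe(img))  # e.g. ['a cat laying on top of a couch next to another cat']
```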

15
preprocessor_config.json Normal file

@@ -0,0 +1,15 @@
{
"do_normalize": true,
"do_resize": true,
"image_mean": [
0.5,
0.5,
0.5
],
"image_std": [
0.5,
0.5,
0.5
],
"size": 224
}
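
`ViTFeatureExtractor` applies these settings when producing `pixel_values`: resize to 224×224, rescale to [0, 1], then normalize each channel with mean 0.5 and std 0.5, mapping pixels into [-1, 1]. A rough NumPy sketch of the equivalent transform, assuming a local copy of the sample image:

```python
import numpy as np
from PIL import Image

img = Image.open("val_000000039769.jpg").convert("RGB").resize((224, 224))
x = np.asarray(img, dtype=np.float32) / 255.0   # rescale to [0, 1]
x = (x - 0.5) / 0.5                             # image_mean / image_std from this config
pixel_values = x.transpose(2, 0, 1)[None]       # shape (1, 3, 224, 224), channels first
```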

1239
report.txt Normal file

File diff suppressed because it is too large

4
requirements.txt Normal file

@@ -0,0 +1,4 @@
Pillow
jax[cpu]
flax
git+https://github.com/ydshieh/transformers.git@flax_vision_encoder_decoder

1
special_tokens_map.json Normal file

@@ -0,0 +1 @@
{"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}

1
tokenizer.json Normal file

File diff suppressed because one or more lines are too long

1
tokenizer_config.json Normal file

@@ -0,0 +1 @@
{"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}

BIN
val_000000039769.jpg Normal file

Binary file not shown.

1
vocab.json Normal file

File diff suppressed because one or more lines are too long