import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, LayoutLMv2ForQuestionAnswering, set_seed

set_seed(88)

# LayoutLMv2's processor runs OCR via pytesseract, and the model's visual
# backbone requires detectron2; both must be installed alongside transformers.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")


def vqa(image, question):
    # Gradio passes the uploaded image as a numpy array; convert to PIL RGB.
    inp = Image.fromarray(image.astype("uint8"), "RGB")
    encoding = processor(inp, question, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoding)
    # The model predicts the answer as a span over the OCR'd document tokens:
    # take the highest-scoring start and end positions and decode that slice.
    predicted_start_idx = outputs.start_logits.argmax(-1).item()
    predicted_end_idx = outputs.end_logits.argmax(-1).item()
    predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
    predicted_answer = processor.tokenizer.decode(predicted_answer_tokens)
    return predicted_answer


demo = gr.Interface(
    fn=vqa,
    inputs=["image", "text"],
    outputs="text",
    title="vqa",
    # The example files must exist next to this script.
    examples=[
        ["income.png", "What are the 2020 net sales?"],
        ["invoice.png", "What is the invoice number?"],
    ],
)

if __name__ == "__main__":
    demo.queue(concurrency_count=3).launch(server_name="0.0.0.0", server_port=7026)