add dit-document-layout-analysis

This commit is contained in:
songw 2023-04-06 16:51:38 +08:00
parent 0dbe302a4b
commit b06fcd781c
6 changed files with 186 additions and 0 deletions

View File

@ -0,0 +1,69 @@
# Base settings: model, datasets, solver, input pipeline and dataloader.
# NOTE(review): presumably this is the "Base-RCNN-FPN.yml" that the cascade
# config in this commit names as its _BASE_ - confirm the file name.
MODEL:
MASK_ON: True
META_ARCHITECTURE: "GeneralizedRCNN"
# The config that declares `_BASE_: "Base-RCNN-FPN.yml"` sets PIXEL_MEAN and
# PIXEL_STD to 127.5 per channel instead of the values below.
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
BACKBONE:
NAME: "build_vit_fpn_backbone"
VIT:
OUT_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
DROP_PATH: 0.1
IMG_SIZE: [224,224]
POS_TYPE: "abs"
FPN:
# Same four layer names as VIT OUT_FEATURES above.
IN_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
# The config that declares this file as its _BASE_ sets POST_NMS_TOPK_TRAIN to 2000.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
# The config that declares this file as its _BASE_ sets NAME to CascadeROIHeads.
NAME: "StandardROIHeads"
# Same list as the RPN IN_FEATURES, minus "p6".
IN_FEATURES: ["p2", "p3", "p4", "p5"]
# Five classes; the demo script labels them text, title, list, table, figure.
NUM_CLASSES: 5
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("publaynet_train",)
# The demo script reads the first TEST entry to look up dataset metadata.
TEST: ("publaynet_val",)
SOLVER:
LR_SCHEDULER_NAME: "WarmupCosineLR"
AMP:
ENABLED: True
OPTIMIZER: "ADAMW"
BACKBONE_MULTIPLIER: 1.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 1.0
NORM_TYPE: 2.0
WARMUP_FACTOR: 0.01
BASE_LR: 0.0004
WEIGHT_DECAY: 0.05
# The config that declares this file as its _BASE_ sets IMS_PER_BATCH to 16.
IMS_PER_BATCH: 32
INPUT:
CROP:
ENABLED: True
TYPE: "absolute_range"
SIZE: (384, 600)
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
VERSION: 2
AUG:
DETR: True
SEED: 42

View File

@ -0,0 +1,68 @@
import os
import sys
sys.path.append("unilm")
import cv2
from unilm.dit.object_detection.ditod import add_vit_config
import torch
from detectron2.config import CfgNode as CN
from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor
import gradio as gr
# Build the configuration: start from Detectron2's defaults, let
# add_vit_config (from unilm's ditod package) extend it, then overlay the
# YAML file. The path is relative to the current working directory.
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file("cascade_dit_base.yml")

# Checkpoint to load, given as a URL; set after the merge so it replaces
# any WEIGHTS value that came from the YAML file.
cfg.MODEL.WEIGHTS = "https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    cfg.MODEL.DEVICE = "cuda"
else:
    cfg.MODEL.DEVICE = "cpu"

# One predictor, created at import time and reused by analyze_image.
predictor = DefaultPredictor(cfg)
def analyze_image(img):
    """Run layout detection on a document image and draw the predictions.

    Args:
        img: Image array delivered by the Gradio input component, which is
            declared with ``type="numpy"``. Gradio supplies such images in
            RGB channel order; the array is indexed as (rows, cols, channels).

    Returns:
        An RGB image array: the input with the predicted instances drawn on
        top of it.
    """
    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    # Class names used for the labels in the drawing.
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text","title","list","table","figure"])

    # DefaultPredictor documents its input as BGR, whereas Gradio hands over
    # RGB, so flip the channel axis before inference. Previously the RGB
    # array was passed unchanged and the model saw swapped red/blue channels.
    output = predictor(img[:, :, ::-1])["instances"]

    # Visualizer expects RGB, which is what we already have; no flip is
    # needed on the way in or on the way out.
    v = Visualizer(img,
                   md,
                   scale=1.0,
                   instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(output.to("cpu"))
    return result.get_image()
# Texts shown on the demo page.
title = "Interactive demo: Document Layout Analysis with DiT"
description = "Demo for Microsoft's DiT, the Document Image Transformer for state-of-the-art document understanding tasks. This particular model is fine-tuned on PubLayNet, a large dataset for document layout analysis (read more at the links below). To use it, simply upload an image or use the example image below and click 'Submit'. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select 'Open image in new tab'."
# Footer links. All three links live in one centred paragraph; the original
# markup closed the <p> after the second link, leaving the third link outside
# the paragraph and an unmatched </p> at the end.
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2203.02378' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/unilm/tree/master/dit' target='_blank'>Github Repo</a> | <a href='https://huggingface.co/docs/transformers/master/en/model_doc/dit' target='_blank'>HuggingFace doc</a></p>"
# Example input offered on the page; the path is relative to the working directory.
examples =[['publaynet_example.jpeg']]
# Force a fixed height on the input/output image widgets.
css = ".output-image, .input-image, .image-preview {height: 600px !important}"

# Wire analyze_image to a numpy-image input and a numpy-image output.
iface = gr.Interface(fn=analyze_image,
                     inputs=gr.inputs.Image(type="numpy", label="document image"),
                     outputs=gr.outputs.Image(type="numpy", label="annotated document"),
                     title=title,
                     description=description,
                     examples=examples,
                     article=article,
                     css=css,
                     enable_queue=True)

# Start the web server; this runs at import time because the file is a script.
iface.launch()

View File

@ -0,0 +1,27 @@
# Earlier, simpler image definition, kept commented out for reference:
# FROM python:3.9
# COPY . /app
# WORKDIR /app
# RUN pip install --trusted-host pypi.python.org -r requirements.txt
# CMD python app.py

# Use an official Python runtime as a parent image
FROM python:3.7.4-slim
# Point apt at the USTC mirror for both the main and the security archives
RUN sed -i 's#http://deb.debian.org#https://mirrors.ustc.edu.cn#g' /etc/apt/sources.list && sed -i 's|security.debian.org/debian-security|mirrors.ustc.edu.cn/debian-security|g' /etc/apt/sources.list
# Set the working directory to /app
WORKDIR /app
# Copy only the requirements list first (presumably so the dependency layers
# below are not rebuilt every time the application sources change)
COPY requirements.txt /app
# Make pip use the USTC PyPI mirror by default
RUN pip config set global.index-url https://pypi.mirrors.ustc.edu.cn/simple/
# System packages: libGL and GLib shared libraries (presumably needed by
# opencv-python - confirm) and a compiler toolchain
RUN apt update && apt install -y libgl1-mesa-glx libglib2.0-0 build-essential
# Python dependencies listed in requirements.txt
RUN pip3 install --trusted-host pypi.python.org -r requirements.txt
# detectron2 from the prebuilt wheel index for CUDA 11.3 / torch 1.10.
# NOTE(review): neither this file nor requirements.txt installs torch itself,
# although the application imports it - confirm where it comes from.
RUN python -m pip install detectron2 -f \
https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
# Copy the current directory contents into the container at /app
COPY . /app
# Run DiT.py when the container launches.
# NOTE(review): no port is exposed and no server address is configured here;
# check that the Gradio server started by the script is reachable from outside
# the container.
CMD ["python", "DiT.py"]

View File

@ -0,0 +1,20 @@
# Cascade R-CNN variant: only the values listed here differ from the file
# named by _BASE_.
# NOTE(review): presumably this is the "cascade_dit_base.yml" that the demo
# script loads - confirm the file name.
_BASE_: "Base-RCNN-FPN.yml"
MODEL:
# Per-channel normalisation; replaces the base file's PIXEL_MEAN / PIXEL_STD.
PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
PIXEL_STD: [ 127.5, 127.5, 127.5 ]
# NOTE(review): if this is the file the demo script loads, the script assigns
# a different checkpoint URL to WEIGHTS right after loading, so this value is
# not the one used by the demo.
WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
VIT:
NAME: "dit_base_patch16"
ROI_HEADS:
# Replaces the base file's "StandardROIHeads".
NAME: CascadeROIHeads
ROI_BOX_HEAD:
CLS_AGNOSTIC_BBOX_REG: True
RPN:
# The base file uses 1000 here.
POST_NMS_TOPK_TRAIN: 2000
SOLVER:
WARMUP_ITERS: 1000
# The base file uses 32 here.
IMS_PER_BATCH: 16
MAX_ITER: 60000
# Checkpoint and evaluation periods are both 2000.
CHECKPOINT_PERIOD: 2000
TEST:
EVAL_PERIOD: 2000

Binary file not shown.

After

Width:  |  Height:  |  Size: 319 KiB

View File

@ -0,0 +1,2 @@
gradio
opencv-python