import torch.nn as nn
import copy
from detrex.layers import PositionEmbeddingSine
from detrex.modeling.backbone import ResNet, BasicStem
from detrex.modeling.neck import ChannelMapper
from detectron2.layers import ShapeSpec
from detectron2.config import LazyCall as L
from detrex.modeling.matcher import HungarianMatcher

from projects.unified_layout_analysis_v2.modeling import (
    UniDETRMultiScales,
    DabDeformableDetrTransformer,
    DabDeformableDetrTransformerEncoder,
    DabDeformableDetrTransformerDecoder,
    TwoStageCriterion,
    DeepStem,
)

from projects.unified_layout_analysis_v2.modeling.uni_relation_prediction_head import (
    UniRelationPredictionHead,
    HRIPNHead,
)

from projects.unified_layout_analysis_v2.modeling.doc_transformer import (
    DocTransformerEncoder,
    DocTransformer,
)

from projects.unified_layout_analysis_v2.modeling.backbone.bert import (
    Bert,
    TextTokenizer,
)

# Define the main model
model = L(UniDETRMultiScales)(
    backbone=L(ResNet)(
        stem=L(DeepStem)(in_channels=3, out_channels=64, norm="FrozenBN"),
        stages=L(ResNet.make_default_stages)(
            depth=18,
            norm="FrozenBN",
        ),
        out_features=["res2", "res3", "res4", "res5"],
        freeze_at=1,
    ),
    position_embedding=L(PositionEmbeddingSine)(
        num_pos_feats=128,
        temperature=10000,
        normalize=True,
        offset=-0.5,
    ),
    neck=L(ChannelMapper)(
        input_shapes={
            "res3": ShapeSpec(channels=128),
            "res4": ShapeSpec(channels=256),
            "res5": ShapeSpec(channels=512),
        },
        in_features=["res3", "res4", "res5"],
        out_channels=256,
        num_outs=4,
        kernel_size=1,
        norm_layer=L(nn.GroupNorm)(num_groups=32, num_channels=256),
    ),
    transformer=L(DabDeformableDetrTransformer)(
        encoder=L(DabDeformableDetrTransformerEncoder)(
            embed_dim=256,
            num_heads=8,
            feedforward_dim=2048,
            attn_dropout=0.0,
            ffn_dropout=0.0,
            num_layers=3,
            post_norm=False,
            num_feature_levels=4,
        ),
        decoder=L(DabDeformableDetrTransformerDecoder)(
            embed_dim=256,
            num_heads=8,
            feedforward_dim=2048,
            attn_dropout=0.0,
            ffn_dropout=0.0,
            num_layers=3,
            return_intermediate=True,
            num_feature_levels=4,
        ),
        as_two_stage=True,
        num_feature_levels=4,
        decoder_in_feature_level=[0, 1, 2, 3],
    ),
    embed_dim=256,
    num_classes=14,
    num_graphical_classes=2,
    num_types=3,
    # Relation types: 0 = a->a, 1 = intra, 2 = inter
    relation_prediction_head=L(UniRelationPredictionHead)(
        relation_num_classes=2,
        embed_dim=256,
        hidden_dim=1024,
    ),
    aux_loss=True,
    criterion=L(TwoStageCriterion)(
        num_classes=2,
        matcher=L(HungarianMatcher)(
            cost_class=2.0,
            cost_bbox=5.0,
            cost_giou=2.0,
            cost_class_type="focal_loss_cost",
            alpha=0.25,
            gamma=2.0,
        ),
        weight_dict={
            "loss_class": 1.0,
            "loss_bbox": 5.0,
            "loss_giou": 2.0,
        },
        loss_class_type="focal_loss",
        alpha=0.25,
        gamma=2.0,
        two_stage_binary_cls=False,
    ),
    as_two_stage=True,
    pixel_mean=[123.675, 116.280, 103.530],
    pixel_std=[58.395, 57.120, 57.375],
    device="cuda",
    windows_size=[6, 8],
    freeze_language_model=False,
)

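# Logical-role relation prediction head, attached after the main constructor;
# attribute assignments on a LazyCall node become extra keyword arguments for
# UniDETRMultiScales when the config is instantiated.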
model.logical_role_relation_prediction_head = L(UniRelationPredictionHead)(
    relation_num_classes=1,
    embed_dim=256,
    hidden_dim=1024,
)

# Update auxiliary loss weight dictionary
base_weight_dict = copy.deepcopy(model.criterion.weight_dict)
if model.aux_loss:
    weight_dict = model.criterion.weight_dict
    aux_weight_dict = {
        f"{k}_{i}": v
        for i in range(model.transformer.decoder.num_layers - 1)
        for k, v in base_weight_dict.items()
    }
    weight_dict.update(aux_weight_dict)
    model.criterion.weight_dict = weight_dict
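# With num_layers = 3, the loop above adds weights for the two intermediate
# decoder layers: "loss_class_0", "loss_bbox_0", "loss_giou_0",
# "loss_class_1", "loss_bbox_1", "loss_giou_1".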

# Loss weights for the encoder (first-stage proposal) outputs used when as_two_stage=True
model.criterion.weight_dict.update({
    "loss_class_enc": 1.0,
    "loss_bbox_enc": 5.0,
    "loss_giou_enc": 2.0,
})

# Add document transformer module
model.doc_transformer = L(DocTransformer)(
    encoder=L(DocTransformerEncoder)(
        embed_dim=256,
        num_heads=8,
        feedforward_dim=2048,
        attn_dropout=0.0,
        ffn_dropout=0.0,
        num_layers=3,
        post_norm=False,
        batch_first=True,
    ),
    decoder=None,
)

# Add document-level relation prediction head
model.doc_relation_prediction_head = L(HRIPNHead)(
    relation_num_classes=2,
    embed_dim=256,
    hidden_dim=1024,
)

# Add language model
model.language_model = L(Bert)(
    bert_model_type="bert-base-uncased",
    text_max_len=512,
    input_overlap_stride=0,
    output_embedding_dim=1024,
    max_batch_size=1,
    used_layers=12,
    used_hidden_idxs=[12],
    hidden_embedding_dim=768,
)
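# Note: bert-base-uncased has 12 transformer layers with hidden size 768
# (matching used_layers=12 and hidden_embedding_dim=768); used_hidden_idxs=[12]
# presumably selects the final layer's hidden states, assuming index 0 refers
# to the embedding output as in Hugging Face's hidden_states tuple. How the
# Bert wrapper interprets these indices is an assumption here.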

# Add tokenizer
model.tokenizer = L(TextTokenizer)(
    model_type="bert-base-uncased",
    text_max_len=512,
    input_overlap_stride=0,
)
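
# Usage sketch (illustrative only, not part of the config): this file is a
# detectron2 LazyConfig, so the model graph above can be materialized with
# `instantiate`. The config path below is hypothetical.
#
#   from detectron2.config import LazyConfig, instantiate
#
#   cfg = LazyConfig.load("projects/unified_layout_analysis_v2/configs/<this_config>.py")
#   detector = instantiate(cfg.model)  # builds UniDETRMultiScales and all sub-modules
#   detector.to(cfg.model.device)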