init

LibertFan · Nov 5, 2021 · c86c2b7 · c86c2b7
commit c86c2b7
Show file tree

Hide file tree

Showing 65 changed files with 7,728 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,38 @@
+FROM nvcr.io/nvidia/pytorch:19.05-py3
+
+# Basic Python packages, pinned to exact versions for reproducibility.
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir \
+        pytorch-pretrained-bert==0.6.2 \
+        tensorboardX==1.7 \
+        ipdb==0.12 \
+        lz4==2.1.9 \
+        lmdb==0.97
+
+####### horovod for multi-GPU (distributed) training #######
+
+# Update OpenMPI to avoid a horovod bug: the MPI tree shipped at /usr/local/mpi
+# is removed and OpenMPI 3.1.4 is built from source into the same prefix.
+# Download, build, install and clean-up all happen in this single layer so the
+# tarball and source tree do not persist in the image.
+RUN rm -r /usr/local/mpi && \
+    wget https://download.open-mpi.org/release/open-mpi/v3.1/openmpi-3.1.4.tar.gz && \
+    gunzip -c openmpi-3.1.4.tar.gz | tar xf - && \
+    cd openmpi-3.1.4 && \
+    ./configure --prefix=/usr/local/mpi --enable-orterun-prefix-by-default \
+        --with-verbs --disable-getpwuid && \
+    make -j$(nproc) all && make install && \
+    ldconfig && \
+    cd - && rm -r openmpi-3.1.4 && rm openmpi-3.1.4.tar.gz
+
+# Record the OpenMPI version that was just installed.
+ENV OPENMPI_VERSION=3.1.4
+
+# horovod, built with NCCL all-reduce (linked as a shared library) and PyTorch
+# support; ldconfig refreshes the linker cache after the install.
+RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITH_PYTORCH=1 \
+    pip install --no-cache-dir horovod==0.16.4 && \
+    ldconfig
+
+# ssh client and server; /var/run/sshd is created for the server.
+# The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends openssh-client openssh-server && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /var/run/sshd
+
+# Allow OpenSSH to talk to containers without asking for confirmation:
+# drop any existing StrictHostKeyChecking lines, then append an explicit "no".
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+
+WORKDIR /src
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Microsoft Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,146 @@
+# NSGDC
+
+Some code in this repo is copied/modified from open-source implementations made available by
+[UNITER](https://github.com/ChenRocks/UNITER),
+[PyTorch](https://github.com/pytorch/pytorch),
+[HuggingFace](https://github.com/huggingface/transformers),
+[OpenNMT](https://github.com/OpenNMT/OpenNMT-py),
+and [Nvidia](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch).
+The image features are extracted using [BUTD](https://github.com/peteanderson80/bottom-up-attention).
+
+
+## Requirements
+The setup follows UNITER. We provide a Docker image for easier reproduction. Please install the following:
+  - [nvidia driver](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#package-manager-installation) (418+), 
+  - [Docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/) (19.03+), 
+  - [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-docker#quickstart).
+
+Our scripts require the user to have the [docker group membership](https://docs.docker.com/install/linux/linux-postinstall/)
+so that docker commands can be run without sudo.
+We only support Linux with NVIDIA GPUs. We test on Ubuntu 18.04 and V100 cards.
+We use mixed-precision training hence GPUs with Tensor Cores are recommended.
+
+## Image-Text Retrieval
+### Download Data
+```
+bash scripts/download_itm.sh $PATH_TO_STORAGE
+```
+
+### Launch the Docker Container
+```bash
+# docker image should be automatically pulled
+source launch_container.sh $PATH_TO_STORAGE/txt_db $PATH_TO_STORAGE/img_db \
+$PATH_TO_STORAGE/finetune $PATH_TO_STORAGE/pretrained
+```
+
+In case you would like to reproduce the whole preprocessing pipeline.
+
+The launch script respects the `$CUDA_VISIBLE_DEVICES` environment variable.
+Note that the source code is mounted into the container under `/src` instead 
+of built into the image so that user modification will be reflected without
+re-building the image. (Data folders are mounted into the container separately
+for flexibility on folder structures.)
+
+
+### Image-Text Retrieval (Flickr30k)
+```
+# Train with the base setting
+bash run_cmds/tran_pnsgd_base_flickr.sh
+bash run_cmds/tran_pnsgd2_base_flickr.sh
+
+# Train with the large setting
+bash run_cmds/tran_pnsgd_large_flickr.sh
+bash run_cmds/tran_pnsgd2_large_flickr.sh
+```
+
+### Image-Text Retrieval (COCO)
+```
+# Train with the base setting
+bash run_cmds/tran_pnsgd_base_coco.sh
+bash run_cmds/tran_pnsgd2_base_coco.sh
+
+# Train with the large setting
+bash run_cmds/tran_pnsgd_large_coco.sh
+bash run_cmds/tran_pnsgd2_large_coco.sh
+```
+
+### Run Inference
+```
+bash run_cmds/inf_nsgd.sh
+```
+
+## Results
+
+Our models achieve the following performance.
+
+### MS-COCO
+<table>
+	<tr>
+	    <th rowspan="2">Model</th>
+	    <th colspan="3">Image-to-Text</th>
+	    <th colspan="3">Text-to-Image</th>  
+	</tr >
+	<tr>
+	    <td>R@1</td>
+	    <td>R@5</td>
+	    <td>R@10</td>
+	    <td>R@1</td>
+	    <td>R@5</td>
+	    <td>R@10</td>
+	</tr>
+	<tr>
+	    <td>NSGDC-Base</td>
+	    <td>66.6</td>
+        <td>88.6</td>
+        <td>94.0</td>
+        <td>51.6</td>
+        <td>79.1</td>
+        <td>87.5</td>
+	</tr>
+	<tr>
+	    <td>NSGDC-Large</td>
+	    <td>67.8</td>
+        <td>89.6</td>
+        <td>94.2</td>
+        <td>53.3</td>
+        <td>80.0</td>
+        <td>88.0</td>
+	</tr>
+</table>
+
+### Flickr30K
+
+
+<table>
+	<tr>
+	    <th rowspan="2">Model</th>
+	    <th colspan="3">Image-to-Text</th>
+	    <th colspan="3">Text-to-Image</th>  
+	</tr >
+	<tr>
+	    <td>R@1</td>
+	    <td>R@5</td>
+	    <td>R@10</td>
+	    <td>R@1</td>
+	    <td>R@5</td>
+	    <td>R@10</td>
+	</tr>
+	<tr>
+	    <td>NSGDC-Base</td>
+	    <td>87.9</td>
+        <td>98.1</td>
+        <td>99.3</td>
+        <td>74.5</td>
+        <td>93.3</td>
+        <td>96.3</td>
+	</tr>
+	<tr>
+	    <td>NSGDC-Large</td>
+	    <td>90.6</td>
+        <td>98.8</td>
+        <td>99.1</td>
+        <td>77.3</td>
+        <td>94.3</td>
+        <td>97.3</td>
+	</tr>
+</table>
diff --git a/config/train-itm-pnsgd-base-coco.json b/config/train-itm-pnsgd-base-coco.json
@@ -0,0 +1,43 @@
+{
+    "compressed_db": false,
+    "checkpoint": "log/pretrained/uniter-base.pt",
+    "max_txt_len": 60,
+    "conf_th": 0.2,
+    "max_bb": 100,
+    "min_bb": 10,
+    "num_bb": 36,
+    "train_batch_size": 32,
+    "negative_size": 399,
+    "hard_neg_size": 31,
+    "inf_minibatch_size": 400,
+    "margin": 0.2,
+    "valid_steps": 500,
+    "num_train_steps": 5000,
+    "optim": "adamw",
+    "betas": [
+        0.9,
+        0.98
+    ],
+    "dropout": 0.1,
+    "weight_decay": 0.01,
+    "grad_norm": 2.0,
+    "warmup_steps": 500,
+    "seed": 42,
+    "full_val": true,
+    "fp16": true,
+    "n_workers": 4,
+    "pin_mem": true,
+    "train_txt_dbs": [
+        "itm-data/txt_db3/itm_coco_train.db",
+        "itm-data/txt_db3/itm_coco_restval.db"
+    ],
+    "train_img_dbs": [
+        "itm-data/img_db/coco_train2014/",
+        "itm-data/img_db/coco_val2014/"
+    ],
+    "val_txt_db": "itm-data/txt_db3/itm_coco_val.db",
+    "val_img_db": "itm-data/img_db/coco_val2014",
+    "test_txt_db": "itm-data/txt_db3/itm_coco_test.db",
+    "test_img_db": "itm-data/img_db/coco_val2014",
+    "model_config": "config/uniter-base.json"
+}
diff --git a/config/train-itm-pnsgd-base-flickr.json b/config/train-itm-pnsgd-base-flickr.json
@@ -0,0 +1,41 @@
+{
+    "compressed_db": false,
+    "checkpoint": "log/pretrained/uniter-base.pt",
+    "max_txt_len": 60,
+    "conf_th": 0.2,
+    "max_bb": 100,
+    "min_bb": 10,
+    "num_bb": 36,
+    "train_batch_size": 32,
+    "negative_size": 399,
+    "hard_neg_size": 31,
+    "inf_minibatch_size": 400,
+    "margin": 0.2,
+    "valid_steps": 500,
+    "num_train_steps": 5000,
+    "optim": "adamw",
+    "betas": [
+        0.9,
+        0.98
+    ],
+    "dropout": 0.1,
+    "weight_decay": 0.01,
+    "grad_norm": 2.0,
+    "warmup_steps": 500,
+    "seed": 42,
+    "full_val": true,
+    "fp16": true,
+    "n_workers": 4,
+    "pin_mem": true,
+    "train_txt_dbs": [
+        "itm-data/txt_db3/itm_flickr30k_train.db"
+    ],
+    "train_img_dbs": [
+        "itm-data/img_db/flickr30k/"
+    ],
+    "val_txt_db": "itm-data/txt_db3/itm_flickr30k_val.db",
+    "val_img_db": "itm-data/img_db/flickr30k/",
+    "test_txt_db": "itm-data/txt_db3/itm_flickr30k_test.db",
+    "test_img_db": "itm-data/img_db/flickr30k/",
+    "model_config": "config/uniter-base.json"
+}
diff --git a/config/train-itm-pnsgd-large-coco.json b/config/train-itm-pnsgd-large-coco.json
@@ -0,0 +1,43 @@
+{
+    "compressed_db": false,
+    "checkpoint": "log/pretrained/uniter-large.pt",
+    "max_txt_len": 60,
+    "conf_th": 0.2,
+    "max_bb": 100,
+    "min_bb": 10,
+    "num_bb": 36,
+    "train_batch_size": 32,
+    "negative_size": 399,
+    "hard_neg_size": 31,
+    "inf_minibatch_size": 400,
+    "margin": 0.2,
+    "valid_steps": 500,
+    "num_train_steps": 5000,
+    "optim": "adamw",
+    "betas": [
+        0.9,
+        0.98
+    ],
+    "dropout": 0.1,
+    "weight_decay": 0.01,
+    "grad_norm": 2.0,
+    "warmup_steps": 500,
+    "seed": 42,
+    "full_val": true,
+    "fp16": true,
+    "n_workers": 4,
+    "pin_mem": true,
+    "train_txt_dbs": [
+        "itm-data/txt_db3/itm_coco_train.db",
+        "itm-data/txt_db3/itm_coco_restval.db"
+    ],
+    "train_img_dbs": [
+        "itm-data/img_db/coco_train2014/",
+        "itm-data/img_db/coco_val2014/"
+    ],
+    "val_txt_db": "itm-data/txt_db3/itm_coco_val.db",
+    "val_img_db": "itm-data/img_db/coco_val2014",
+    "test_txt_db": "itm-data/txt_db3/itm_coco_test.db",
+    "test_img_db": "itm-data/img_db/coco_val2014",
+    "model_config": "config/uniter-large.json"
+}
diff --git a/config/train-itm-pnsgd-large-flickr.json b/config/train-itm-pnsgd-large-flickr.json
@@ -0,0 +1,41 @@
+{
+    "compressed_db": false,
+    "checkpoint": "log/pretrained/uniter-large.pt",
+    "max_txt_len": 60,
+    "conf_th": 0.2,
+    "max_bb": 100,
+    "min_bb": 10,
+    "num_bb": 36,
+    "train_batch_size": 16,
+    "negative_size": 399,
+    "hard_neg_size": 31,
+    "inf_minibatch_size": 400,
+    "margin": 0.2,
+    "valid_steps": 500,
+    "num_train_steps": 5000,
+    "optim": "adamw",
+    "betas": [
+        0.9,
+        0.98
+    ],
+    "dropout": 0.1,
+    "weight_decay": 0.01,
+    "grad_norm": 2.0,
+    "warmup_steps": 500,
+    "seed": 42,
+    "full_val": true,
+    "fp16": true,
+    "n_workers": 4,
+    "pin_mem": true,
+    "train_txt_dbs": [
+        "itm-data/txt_db3/itm_flickr30k_train.db"
+    ],
+    "train_img_dbs": [
+        "itm-data/img_db/flickr30k/"
+    ],
+    "val_txt_db": "itm-data/txt_db3/itm_flickr30k_val.db",
+    "val_img_db": "itm-data/img_db/flickr30k/",
+    "test_txt_db": "itm-data/txt_db3/itm_flickr30k_test.db",
+    "test_img_db": "itm-data/img_db/flickr30k/",
+    "model_config": "config/uniter-large.json"
+}