sail-sg
diff --git a/‎README.md
Lines changed: 15 additions & 11 deletions b/‎README.md
Lines changed: 15 additions & 11 deletions
diff --git a/‎infer_mdt.py
Lines changed: 7 additions & 7 deletions b/‎infer_mdt.py
Lines changed: 7 additions & 7 deletions
diff --git a/‎masked_diffusion/gaussian_diffusion.py
Lines changed: 23 additions & 1 deletion b/‎masked_diffusion/gaussian_diffusion.py
Lines changed: 23 additions & 1 deletion
@@ -1,10 +1,13 @@
-# Masked Diffusion Transformer
+# Masked Diffusion Transformer V2
 
 [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/masked-diffusion-transformer-is-a-strong/image-generation-on-imagenet-256x256)](https://paperswithcode.com/sota/image-generation-on-imagenet-256x256?p=masked-diffusion-transformer-is-a-strong)
 [![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/shgao/MDT)
 
 The official codebase for [Masked Diffusion Transformer is a Strong Image Synthesizer](https://arxiv.org/abs/2303.14389).
 
+## MDTv2: Faster Convergence & Stronger Performance
+**MDTv2 demonstrates new SOTA (State of the Art) performance and a 5x acceleration compared to the original MDT.**
+
 ## Introduction
 
 Despite its success in image synthesis, we observe that diffusion probabilistic models (DPMs) often lack contextual reasoning ability to learn the relations among object parts in an image, leading to a slow learning process. 
@@ -20,6 +23,7 @@ Experimental results show that MDT achieves superior image synthesis performance
 | Model| Dataset |  Resolution | FID-50K | Inception Score |
 |---------|----------|-----------|---------|--------|
 |MDT-XL/2 | ImageNet | 256x256   | 1.79    | 283.01|
+|MDTv2-XL/2 | ImageNet | 256x256 | 1.58    | 314.73|
 
 [Pretrained model download](https://huggingface.co/shgao/MDT-XL2/tree/main)
 
@@ -53,10 +57,10 @@ as the [ADM's dataloder](https://github.com/openai/guided-diffusion) gets the cl
   <summary>Training on one node (`run.sh`). </summary>
 
 ```shell
-export OPENAI_LOGDIR=output_mdt_s2
+export OPENAI_LOGDIR=output_mdtv2_s2
 NUM_GPUS=8
 
-MODEL_FLAGS="--image_size 256 --mask_ratio 0.30 --decode_layer 2 --model MDT_S_2"
+MODEL_FLAGS="--image_size 256 --mask_ratio 0.30 --decode_layer 4 --model MDTv2_S_2"
 DIFFUSION_FLAGS="--diffusion_steps 1000"
 TRAIN_FLAGS="--batch_size 32"
 DATA_PATH=/dataset/imagenet
@@ -71,8 +75,8 @@ python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS scripts/image_trai
 
 ```shell
 # On master:
-export OPENAI_LOGDIR=output_mdt_xl2
-MODEL_FLAGS="--image_size 256 --mask_ratio 0.30 --decode_layer 2 --model MDT_XL_2"
+export OPENAI_LOGDIR=output_mdtv2_xl2
+MODEL_FLAGS="--image_size 256 --mask_ratio 0.30 --decode_layer 2 --model MDTv2_XL_2"
 DIFFUSION_FLAGS="--diffusion_steps 1000"
 TRAIN_FLAGS="--batch_size 4"
 DATA_PATH=/dataset/imagenet
@@ -82,8 +86,8 @@ GPU_PRE_NODE=8
 python -m torch.distributed.launch --master_addr=$(hostname) --nnodes=$NUM_NODE --node_rank=$RANK --nproc_per_node=$GPU_PRE_NODE --master_port=$MASTER_PORT scripts/image_train.py --data_dir $DATA_PATH $MODEL_FLAGS $DIFFUSION_FLAGS $TRAIN_FLAGS
 
 # On workers:
-export OPENAI_LOGDIR=output_mdt_xl2
-MODEL_FLAGS="--image_size 256 --mask_ratio 0.30 --decode_layer 2 --model MDT_XL_2"
+export OPENAI_LOGDIR=output_mdtv2_xl2
+MODEL_FLAGS="--image_size 256 --mask_ratio 0.30 --decode_layer 2 --model MDTv2_XL_2"
 DIFFUSION_FLAGS="--diffusion_steps 1000"
 TRAIN_FLAGS="--batch_size 4"
 DATA_PATH=/dataset/imagenet
@@ -106,12 +110,12 @@ Please follow the instructions in the `evaluations` folder to set up the evaluat
   <summary>Sampling and Evaluation (`run_sample.sh`): </summary>
 
 ```shell
-MODEL_PATH=output_mdt_xl2/mdt_xl2_v1_ckpt.pt
-export OPENAI_LOGDIR=output_mdt_xl2_eval
+MODEL_PATH=output_mdtv2_xl2/mdt_xl2_v2_ckpt.pt
+export OPENAI_LOGDIR=output_mdtv2_xl2_eval
 NUM_GPUS=8
 
 echo 'CFG Class-conditional sampling:'
-MODEL_FLAGS="--image_size 256 --model MDT_XL_2 --decode_layer 2"
+MODEL_FLAGS="--image_size 256 --model MDTv2_XL_2 --decode_layer 4"
 DIFFUSION_FLAGS="--num_sampling_steps 250 --num_samples 50000  --cfg_cond True"
 echo $MODEL_FLAGS
 echo $DIFFUSION_FLAGS
@@ -123,7 +127,7 @@ echo $MODEL_PATH
 python evaluations/evaluator.py ../dataeval/VIRTUAL_imagenet256_labeled.npz $OPENAI_LOGDIR/samples_50000x256x256x3.npz
 
 echo 'Class-conditional sampling:'
-MODEL_FLAGS="--image_size 256 --model MDT_XL_2 --decode_layer 2"
+MODEL_FLAGS="--image_size 256 --model MDTv2_XL_2 --decode_layer 4"
 DIFFUSION_FLAGS="--num_sampling_steps 250 --num_samples 50000"
 echo $MODEL_FLAGS
 echo $DIFFUSION_FLAGS
 
@@ -8,23 +8,23 @@
 from torchvision.utils import save_image
 from masked_diffusion import create_diffusion
 from diffusers.models import AutoencoderKL
-from masked_diffusion.models import MDT_XL_2
+from masked_diffusion.models import MDTv2_XL_2
 
 
 # Setup PyTorch:
-torch.manual_seed(0)
+torch.manual_seed(1)
 torch.set_grad_enabled(False)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-num_sampling_steps = 500
-cfg_scale = 5.0
+num_sampling_steps = 250
+cfg_scale = 4.0
 pow_scale = 0.01 # large pow_scale increase the diversity, small pow_scale increase the quality.
-model_path = 'mdt_xl2_v1_ckpt.pt'
+model_path = 'mdt_xl2_v2_ckpt.pt'
 
 # Load model:
 image_size = 256
 assert image_size in [256], "We provide pre-trained models for 256x256 resolutions for now."
 latent_size = image_size // 8
-model = MDT_XL_2(input_size=latent_size, decode_layer=2).to(device)
+model = MDTv2_XL_2(input_size=latent_size, decode_layer=2).to(device)
 
 state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
 model.load_state_dict(state_dict)
@@ -33,7 +33,7 @@
 vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(device)
 
 # Labels to condition the model with:
-class_labels = [208]*3
+class_labels = [19,23,106,108,278,282]
 
 # Create sampling noise:
 n = len(class_labels)
 
@@ -28,6 +28,7 @@ class ModelMeanType(enum.Enum):
     PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
     START_X = enum.auto()  # the model predicts x_0
     EPSILON = enum.auto()  # the model predicts epsilon
+    VELOCITY = enum.auto() # the model predicts v
 
 
 class ModelVarType(enum.Enum):
@@ -732,6 +733,26 @@ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
 
         terms = {}
 
+
+        mse_loss_weight = None
+        alpha = _extract_into_tensor(self.sqrt_alphas_cumprod, t, t.shape)
+        sigma = _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, t.shape)
+        snr = (alpha / sigma) ** 2
+        
+        velocity = (alpha[:, None, None, None] * x_t - x_start) / sigma[:, None, None, None]
+
+        # get loss weight
+        if self.model_mean_type is not ModelMeanType.START_X:
+            mse_loss_weight = th.ones_like(t)
+            k = 5.0
+            # min{snr, k}
+            mse_loss_weight = th.stack([snr, k * th.ones_like(t)], dim=1).min(dim=1)[0] / snr
+        else:
+            k = 5.0
+            # min{snr, k}
+            mse_loss_weight = th.stack([snr, k * th.ones_like(t)], dim=1).min(dim=1)[0]
+
+            
         if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
             terms["loss"] = self._vb_terms_bpd(
                 model=model,
@@ -774,9 +795,10 @@ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
                 )[0],
                 ModelMeanType.START_X: x_start,
                 ModelMeanType.EPSILON: noise,
+                ModelMeanType.VELOCITY: velocity,
             }[self.model_mean_type]
             assert model_output.shape == target.shape == x_start.shape
-            terms["mse"] = mean_flat((target - model_output) ** 2)
+            terms["mse"] = mse_loss_weight * mean_flat((target - model_output) ** 2)
             if "vb" in terms:
                 terms["loss"] = terms["mse"] + terms["vb"]
             else: