diff --git a/model/fpn.py b/model/fpn.py
new file mode 100644
index 0000000..d5bdd9c
--- /dev/null
+++ b/model/fpn.py
@@ -0,0 +1,23 @@
+import torch.nn as nn
+import segmentation_models_pytorch as smp
+
+
+class FPN(nn.Module):
+    """Thin wrapper around ``smp.FPN`` with a resnet50 encoder.
+
+    ``forward`` returns the logits wrapped in a one-element list.
+    """
+
+    def __init__(self, num_classes):
+        super().__init__()
+        # No pretrained encoder weights are requested (encoder_weights=None).
+        fpn_options = dict(
+            encoder_name='resnet50', encoder_depth=5, encoder_weights=None,
+            decoder_pyramid_channels=256, decoder_segmentation_channels=128,
+            decoder_merge_policy='add', decoder_dropout=0.,
+            in_channels=3, classes=num_classes,
+        )
+        self.model = smp.FPN(**fpn_options)
+
+    def forward(self, x):
+        return [self.model(x)]
diff --git a/model/losses/__init__.py b/model/losses/__init__.py
new file mode 100644
index 0000000..759ea05
--- /dev/null
+++ b/model/losses/__init__.py
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from pytorch_toolbelt import losses as L
+
+from model.losses.pseudo_ce_loss import PseudoCrossEntropyLoss
+
+
+class LossFunction(nn.Module):
+    """Cross-entropy plus 0.2 x multiclass Dice, both computed on ``logits[0]``."""
+
+    def __init__(self):
+        super().__init__()
+        self.loss_func1, self.loss_func2 = nn.CrossEntropyLoss(), L.DiceLoss(mode='multiclass')
+
+    def forward(self, logits, target):
+        main_logits = logits[0]
+        return self.loss_func1(main_logits, target) + 0.2 * self.loss_func2(main_logits, target)
+
+
+class SelfCorrectionLossFunction(nn.Module):
+    """Soft-label cross-entropy plus 0.2 x multiclass Dice, both on ``predicts[0]``."""
+
+    def __init__(self, cycle=12):
+        super(SelfCorrectionLossFunction, self).__init__()
+        self.cycle = cycle
+        self.sc_loss_func1 = PseudoCrossEntropyLoss()
+        self.sc_loss_func2 = L.DiceLoss(mode='multiclass')
+
+    def forward(self, predicts, target, soft_predict, cycle_n):
+        """Blend one-hot ``target`` into softmax(``soft_predict``) with weight 1/(cycle_n+1); return the loss."""
+        with torch.no_grad():  # must be called: the bare attribute is not a context manager
+            soft_predict = F.softmax(soft_predict, dim=1)
+            one_hot = self.to_one_hot(target, soft_predict.size(1))
+            soft_predict = self.weighted(one_hot, soft_predict, alpha=1. / (cycle_n + 1))
+        loss1 = self.sc_loss_func1(predicts[0], soft_predict)
+        return loss1 + 0.2 * self.sc_loss_func2(predicts[0], target)  # index like loss1, not the list
+
+    @staticmethod
+    def weighted(target_one_hot, soft_predict, alpha):
+        return alpha * target_one_hot + (1 - alpha) * soft_predict
+
+    @staticmethod
+    def to_one_hot(tensor, num_cls, dim=1, ignore_index=255):
+        """One-hot encode a (b, h, w) label map; ``ignore_index`` pixels are encoded as class 0."""
+        labels = tensor.masked_fill(tensor == ignore_index, 0)  # new tensor: caller's labels untouched
+        b, h, w = labels.shape
+        onehot_tensor = torch.zeros(b, num_cls, h, w, device=labels.device)  # follow the input's device
+        return onehot_tensor.scatter_(dim, labels.unsqueeze(dim), 1)
diff --git a/model/losses/__pycache__/__init__.cpython-36.pyc b/model/losses/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..689b3ec
Binary files /dev/null and b/model/losses/__pycache__/__init__.cpython-36.pyc differ
diff --git a/model/losses/__pycache__/pseudo_ce_loss.cpython-36.pyc b/model/losses/__pycache__/pseudo_ce_loss.cpython-36.pyc
new file mode 100644
index 0000000..a7d0505
Binary files /dev/null and b/model/losses/__pycache__/pseudo_ce_loss.cpython-36.pyc differ
diff --git a/model/losses/pseudo_ce_loss.py b/model/losses/pseudo_ce_loss.py
new file mode 100644
index 0000000..4bc8ac9
--- /dev/null
+++ b/model/losses/pseudo_ce_loss.py
@@ -0,0 +1,16 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch import Tensor
+
+
+class PseudoCrossEntropyLoss(nn.Module):
+    """Mean over positions of ``-sum(log_softmax(input) * target)`` taken along ``dim``."""
+
+    def __init__(self, dim=1):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, input: Tensor, target: Tensor):
+        return (-F.log_softmax(input, dim=self.dim) * target).sum(dim=self.dim).mean()
diff --git a/model/optim/__init__.py b/model/optim/__init__.py
new file mode 100644
index 0000000..50eea8f
--- /dev/null
+++ b/model/optim/__init__.py
@@ -0,0 +1,78 @@
+import torch.optim as optim
+
+from .radam import RAdam
+from .lookahead import Lookahead
+from .cyclicLR import CyclicCosAnnealingLR
+from .warmup_scheduler import GradualWarmupScheduler
+
+
+def get_optimizer(params, optimizer_cfg):
+    """Build the optimizer selected by ``optimizer_cfg['mode']``.
+
+    'SGD' and 'RAdam' are matched by name; any other mode yields Adam. When
+    ``optimizer_cfg['lookahead']`` is truthy the result is wrapped in Lookahead.
+    """
+    mode, lr, decay = optimizer_cfg['mode'], optimizer_cfg['lr'], optimizer_cfg['weight_decay']
+    if mode == 'SGD':
+        base = optim.SGD(params, lr=lr, momentum=0.9, weight_decay=decay, nesterov=optimizer_cfg['nesterov'])
+    else:
+        adam_cls = RAdam if mode == 'RAdam' else optim.Adam
+        base = adam_cls(params, lr=lr, betas=(0.9, 0.999), weight_decay=decay)
+
+    # todo: add split_weights.py
+    if not optimizer_cfg['lookahead']:
+        return base
+    return Lookahead(base, k=5, alpha=0.5)
+
+
+def get_scheduler(optimizer, scheduler_cfg):
+    """Build the LR scheduler named by ``scheduler_cfg['mode']``.
+
+    Each branch reads its own keys from ``scheduler_cfg``; an unknown mode raises ValueError.
+    """
+    MODE = scheduler_cfg['mode']
+
+    if MODE == 'OneCycleLR':
+        scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
+                                                  max_lr=optimizer.param_groups[0]['lr'],
+                                                  total_steps=scheduler_cfg['steps'],
+                                                  pct_start=scheduler_cfg['pct_start'],
+                                                  final_div_factor=scheduler_cfg['final_div_factor'],
+                                                  cycle_momentum=scheduler_cfg['cycle_momentum'],
+                                                  anneal_strategy=scheduler_cfg['anneal_strategy'])
+    elif MODE == 'PolyLR':
+        lr_lambda = lambda step: (1 - step / scheduler_cfg['steps']) ** scheduler_cfg['power']
+        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
+    elif MODE == 'CosineAnnealingLR':
+        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=scheduler_cfg['steps'],
+                                                         eta_min=scheduler_cfg['eta_min'])
+    elif MODE == 'MultiStepLR':
+        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
+                                                   scheduler_cfg['milestones'],
+                                                   gamma=scheduler_cfg['gamma'])
+    elif MODE == 'CosineAnnealingWarmRestarts':
+        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
+                                                                   T_0=scheduler_cfg['T_0'],
+                                                                   T_mult=scheduler_cfg['T_multi'],
+                                                                   eta_min=scheduler_cfg['eta_min'])
+    elif MODE == 'CyclicCosAnnealingLR':
+        scheduler = CyclicCosAnnealingLR(optimizer,
+                                         milestones=scheduler_cfg['milestones'],
+                                         decay_milestones=scheduler_cfg['decay_milestones'],
+                                         eta_min=scheduler_cfg['eta_min'],
+                                         gamma=scheduler_cfg['gamma'])
+    elif MODE == 'GradualWarmupScheduler':  # compare the local MODE like every other branch
+        # The step schedule only starts after warm-up, so its milestones are shifted by warmup_steps.
+        milestones = [m - scheduler_cfg['warmup_steps'] for m in scheduler_cfg['milestones']]
+        scheduler_steplr = optim.lr_scheduler.MultiStepLR(optimizer,
+                                                          milestones=milestones,
+                                                          gamma=scheduler_cfg['gamma'])
+        # multiplier must be a scalar (>= 1.), not the milestone list. NOTE(review): key name assumed - confirm.
+        scheduler = GradualWarmupScheduler(optimizer,
+                                           multiplier=scheduler_cfg['multiplier'],
+                                           total_epoch=scheduler_cfg['warmup_steps'],
+                                           after_scheduler=scheduler_steplr)
+    else:
+        raise ValueError('Unsupported scheduler mode: {}'.format(MODE))
+
+    return scheduler
diff --git a/model/optim/__pycache__/__init__.cpython-36.pyc b/model/optim/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..63f4328
Binary files /dev/null and b/model/optim/__pycache__/__init__.cpython-36.pyc differ
diff --git a/model/optim/__pycache__/cyclicLR.cpython-36.pyc b/model/optim/__pycache__/cyclicLR.cpython-36.pyc
new file mode 100644
index 0000000..9ba3d09
Binary files /dev/null and b/model/optim/__pycache__/cyclicLR.cpython-36.pyc differ
diff --git a/model/optim/__pycache__/lookahead.cpython-36.pyc b/model/optim/__pycache__/lookahead.cpython-36.pyc
new file mode 100644
index 0000000..9c0bb0e
Binary files /dev/null and b/model/optim/__pycache__/lookahead.cpython-36.pyc differ
diff --git a/model/optim/__pycache__/radam.cpython-36.pyc b/model/optim/__pycache__/radam.cpython-36.pyc
new file mode 100644
index 0000000..b81b2c1
Binary files /dev/null and b/model/optim/__pycache__/radam.cpython-36.pyc differ
diff --git a/model/optim/__pycache__/warmup_scheduler.cpython-36.pyc b/model/optim/__pycache__/warmup_scheduler.cpython-36.pyc
new file mode 100644
index 0000000..3f03a30
Binary files /dev/null and b/model/optim/__pycache__/warmup_scheduler.cpython-36.pyc differ
diff --git a/model/optim/cyclicLR.py b/model/optim/cyclicLR.py
new file mode 100644
index 0000000..fa38311
--- /dev/null
+++ b/model/optim/cyclicLR.py
@@ -0,0 +1,125 @@
+import math
+from bisect import bisect_right
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.optimizer import Optimizer
+
+
+class CyclicCosAnnealingLR(_LRScheduler):
+    r"""Cosine annealing that restarts at every milestone.
+
+    Between two consecutive milestones (the first segment starts at 0) the
+    learning rate of each parameter group follows
+
+    .. math::
+        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 +
+        \cos(\frac{T_{cur}}{T_{max}}\pi))
+
+    where :math:`T_{cur}` is the number of epochs since the segment began,
+    :math:`T_{max}` is the segment width and :math:`\eta_{max}` is the initial
+    lr scaled by ``gamma`` once per decay milestone already reached. From the
+    last milestone onwards the lr is held at :math:`\eta_{min}`.
+
+    Inspired by `SGDR: Stochastic Gradient Descent with Warm Restarts`_.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        milestones (list of ints): List of epoch indices. Must be increasing.
+        decay_milestones (list of ints): List of increasing epoch indices.
+            Ideally, decay values should overlap with milestone points.
+        gamma (float): Factor by which to decay the max learning rate at each
+            decay milestone.
+        eta_min (float): Minimum learning rate. Default: 1e-6
+        last_epoch (int): The index of last epoch. Default: -1.
+
+    Raises:
+        ValueError: If ``milestones`` is not in sorted order.
+
+    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
+        https://arxiv.org/abs/1608.03983
+    """
+
+    def __init__(self, optimizer, milestones, decay_milestones=None, gamma=0.5, eta_min=1e-6, last_epoch=-1):
+        if list(milestones) != sorted(milestones):
+            # .format() is required so the offending list appears in the message.
+            raise ValueError('Milestones should be a list of'
+                             ' increasing integers. Got {}'.format(milestones))
+        self.eta_min = eta_min
+        self.milestones = milestones
+        self.milestones2 = decay_milestones
+        self.gamma = gamma
+        super(CyclicCosAnnealingLR, self).__init__(optimizer, last_epoch)
+
+    def get_lr(self):
+        """Return one learning rate per parameter group for ``self.last_epoch``."""
+        if self.last_epoch >= self.milestones[-1]:
+            return [self.eta_min for _ in self.base_lrs]
+
+        # Find the segment [left_barrier, right_barrier) that holds last_epoch.
+        idx = bisect_right(self.milestones, self.last_epoch)
+        left_barrier = 0 if idx == 0 else self.milestones[idx - 1]
+        right_barrier = self.milestones[idx]
+        width = right_barrier - left_barrier
+        curr_pos = self.last_epoch - left_barrier
+
+        # The peak lr shrinks by gamma for every decay milestone already reached.
+        decay = self.gamma ** bisect_right(self.milestones2, self.last_epoch) if self.milestones2 else 1
+        return [self.eta_min + (base_lr * decay - self.eta_min) * (1 + math.cos(math.pi * curr_pos / width)) / 2
+                for base_lr in self.base_lrs]
+
+
+class CyclicLinearLR(_LRScheduler):
+    r"""Linear learning-rate decay that restarts at every milestone.
+
+    Between two consecutive milestones (the first segment starts at 0) the
+    learning rate of each parameter group follows
+
+    .. math::
+        \eta_t = \eta_{min} + (\eta_{max} - \eta_{min})(1 -\frac{T_{cur}}{T_{max}})
+
+    where :math:`T_{cur}` is the number of epochs since the segment began,
+    :math:`T_{max}` is the segment width and :math:`\eta_{max}` is the initial
+    lr scaled by ``gamma`` once per decay milestone already reached. From the
+    last milestone onwards the lr is held at :math:`\eta_{min}`.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        milestones (list of ints): List of epoch indices. Must be increasing.
+        decay_milestones (list of ints): List of increasing epoch indices.
+            Ideally, decay values should overlap with milestone points.
+        gamma (float): Factor by which to decay the max learning rate at each
+            decay milestone.
+        eta_min (float): Minimum learning rate. Default: 1e-6
+        last_epoch (int): The index of last epoch. Default: -1.
+
+    Raises:
+        ValueError: If ``milestones`` is not in sorted order.
+    """
+
+    def __init__(self, optimizer, milestones, decay_milestones=None, gamma=0.5, eta_min=1e-6, last_epoch=-1):
+        if list(milestones) != sorted(milestones):
+            # .format() is required so the offending list appears in the message.
+            raise ValueError('Milestones should be a list of'
+                             ' increasing integers. Got {}'.format(milestones))
+        self.eta_min = eta_min
+        self.gamma = gamma
+        self.milestones = milestones
+        self.milestones2 = decay_milestones
+        super(CyclicLinearLR, self).__init__(optimizer, last_epoch)
+
+    def get_lr(self):
+        """Return one learning rate per parameter group for ``self.last_epoch``."""
+        # From the final milestone onwards the lr stays at eta_min.
+        if self.last_epoch >= self.milestones[-1]:
+            return [self.eta_min for _ in self.base_lrs]
+
+        # Find the segment [left_barrier, right_barrier) that holds last_epoch.
+        idx = bisect_right(self.milestones, self.last_epoch)
+        left_barrier = 0 if idx == 0 else self.milestones[idx - 1]
+        right_barrier = self.milestones[idx]
+        width = right_barrier - left_barrier
+        curr_pos = self.last_epoch - left_barrier
+
+        # The peak lr shrinks by gamma for every decay milestone already reached.
+        decay = self.gamma ** bisect_right(self.milestones2, self.last_epoch) if self.milestones2 else 1
+        return [self.eta_min + (base_lr * decay - self.eta_min) * (1. - 1.0 * curr_pos / width)
+                for base_lr in self.base_lrs]
\ No newline at end of file
diff --git a/model/optim/lookahead.py b/model/optim/lookahead.py
new file mode 100644
index 0000000..378d874
--- /dev/null
+++ b/model/optim/lookahead.py
@@ -0,0 +1,100 @@
+import torch
+from torch.optim import Optimizer
+from collections import defaultdict
+
+
+class Lookahead(Optimizer):
+    '''
+    PyTorch implementation of the lookahead wrapper.
+    Lookahead Optimizer: https://arxiv.org/abs/1907.08610
+    '''
+
+    def __init__(self, optimizer, alpha=0.5, k=6, pullback_momentum="none"):
+        '''
+        :param optimizer:inner optimizer
+        :param k (int): number of lookahead steps
+        :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer.
+        :param pullback_momentum (str): change to inner optimizer momentum on interpolation update
+        '''
+        if not 0.0 <= alpha <= 1.0:
+            raise ValueError(f'Invalid slow update rate: {alpha}')
+        if not 1 <= k:
+            raise ValueError(f'Invalid lookahead steps: {k}')
+        self.optimizer = optimizer
+        self.param_groups = self.optimizer.param_groups
+        self.alpha = alpha
+        self.k = k
+        self.step_counter = 0
+        assert pullback_momentum in ["reset", "pullback", "none"]
+        self.pullback_momentum = pullback_momentum
+        self.state = defaultdict(dict)
+
+        # Cache the current optimizer parameters (the slow weights)
+        for group in self.optimizer.param_groups:
+            for p in group['params']:
+                param_state = self.state[p]
+                param_state['cached_params'] = p.data.clone()
+                if self.pullback_momentum == "pullback":
+                    param_state['cached_mom'] = torch.zeros_like(p.data)  # read by step() on the first sync
+
+    def __getstate__(self):
+        return {
+            'state': self.state,
+            'optimizer': self.optimizer,
+            'alpha': self.alpha,
+            'step_counter': self.step_counter,
+            'k': self.k,
+            'pullback_momentum': self.pullback_momentum
+        }
+
+    def zero_grad(self):
+        self.optimizer.zero_grad()
+
+    def state_dict(self):
+        return self.optimizer.state_dict()
+
+    def load_state_dict(self, state_dict):
+        self.optimizer.load_state_dict(state_dict)
+
+    def _backup_and_load_cache(self):
+        """Useful for performing evaluation on the slow weights (which typically generalize better)"""
+        for group in self.optimizer.param_groups:
+            for p in group['params']:
+                param_state = self.state[p]
+                param_state['backup_params'] = p.data.clone()
+                p.data.copy_(param_state['cached_params'])
+
+    def _clear_and_load_backup(self):
+        for group in self.optimizer.param_groups:
+            for p in group['params']:
+                param_state = self.state[p]
+                p.data.copy_(param_state['backup_params'])
+                del param_state['backup_params']
+
+    def step(self, closure=None):
+        """Performs a single Lookahead optimization step.
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = self.optimizer.step(closure)
+        self.step_counter += 1
+
+        if self.step_counter >= self.k:
+            self.step_counter = 0
+            # Lookahead and cache the current optimizer parameters
+            for group in self.optimizer.param_groups:
+                for p in group['params']:
+                    param_state = self.state[p]
+                    # fast <- alpha * fast + (1 - alpha) * slow; scalar passed by keyword (positional is deprecated)
+                    p.data.mul_(self.alpha).add_(param_state['cached_params'], alpha=1.0 - self.alpha)
+                    param_state['cached_params'].copy_(p.data)
+                    if self.pullback_momentum == "pullback":
+                        internal_momentum = self.optimizer.state[p]["momentum_buffer"]
+                        internal_momentum.mul_(self.alpha).add_(param_state["cached_mom"], alpha=1.0 - self.alpha)
+                        # Snapshot a copy so later in-place changes to the buffer do not alter the cache
+                        param_state["cached_mom"] = internal_momentum.clone()
+                    elif self.pullback_momentum == "reset":
+                        self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)
+
+        return loss
diff --git a/model/optim/radam.py b/model/optim/radam.py
new file mode 100644
index 0000000..f439c04
--- /dev/null
+++ b/model/optim/radam.py
@@ -0,0 +1,250 @@
+import math
+import torch
+from torch.optim.optimizer import Optimizer
+
+
+class RAdam(Optimizer):
+    """RAdam optimizer; while ``N_sma < 5`` it takes a momentum-only step if ``degenerated_to_sgd``, else none."""
+
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+
+        self.degenerated_to_sgd = degenerated_to_sgd
+        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
+            for param in params:
+                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
+                    param['buffer'] = [[None, None, None] for _ in range(10)]
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
+                        buffer=[[None, None, None] for _ in range(10)])
+        super(RAdam, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(RAdam, self).__setstate__(state)
+
+    def step(self, closure=None):
+        """Apply one update to every parameter that has a gradient; returns ``closure()`` or None."""
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data.float()
+                if grad.is_sparse:
+                    raise RuntimeError('RAdam does not support sparse gradients')
+                # The update is computed on a float copy and written back with copy_()
+                p_data_fp32 = p.data.float()
+
+                state = self.state[p]
+
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
+                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
+                else:
+                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
+                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+                # Scalars are passed by keyword; the positional-scalar overloads are deprecated
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                # Slot step % 10 of the group's buffer caches [step, N_sma, step_size]
+                state['step'] += 1
+                buffered = group['buffer'][int(state['step'] % 10)]
+                if state['step'] == buffered[0]:
+                    N_sma, step_size = buffered[1], buffered[2]
+                else:
+                    buffered[0] = state['step']
+                    beta2_t = beta2 ** state['step']
+                    N_sma_max = 2 / (1 - beta2) - 1
+                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
+                    buffered[1] = N_sma
+
+                    # more conservative since it's an approximated value
+                    if N_sma >= 5:
+                        step_size = math.sqrt(
+                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
+                                    N_sma_max - 2)) / (1 - beta1 ** state['step'])
+                    elif self.degenerated_to_sgd:
+                        step_size = 1.0 / (1 - beta1 ** state['step'])
+                    else:
+                        step_size = -1
+                    buffered[2] = step_size
+
+                # more conservative since it's an approximated value
+                if N_sma >= 5:
+                    if group['weight_decay'] != 0:
+                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+                    denom = exp_avg_sq.sqrt().add_(group['eps'])
+                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
+                    p.data.copy_(p_data_fp32)
+                elif step_size > 0:
+                    if group['weight_decay'] != 0:
+                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
+                    p.data.copy_(p_data_fp32)
+
+        return loss
+
+
+class PlainRAdam(Optimizer):
+    """RAdam variant that recomputes ``N_sma`` and the step size on every call instead of caching them."""
+
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+
+        self.degenerated_to_sgd = degenerated_to_sgd
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
+
+        super(PlainRAdam, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(PlainRAdam, self).__setstate__(state)
+
+    def step(self, closure=None):
+        """Apply one update to every parameter that has a gradient; returns ``closure()`` or None."""
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data.float()
+                if grad.is_sparse:
+                    raise RuntimeError('RAdam does not support sparse gradients')
+                # The update is computed on a float copy and written back with copy_()
+                p_data_fp32 = p.data.float()
+
+                state = self.state[p]
+
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
+                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
+                else:
+                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
+                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+                # Scalars are passed by keyword; the positional-scalar overloads are deprecated
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+
+                state['step'] += 1
+                beta2_t = beta2 ** state['step']
+                N_sma_max = 2 / (1 - beta2) - 1
+                N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
+
+                # more conservative since it's an approximated value
+                if N_sma >= 5:
+                    if group['weight_decay'] != 0:
+                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+                    step_size = group['lr'] * math.sqrt(
+                        (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
+                                N_sma_max - 2)) / (1 - beta1 ** state['step'])
+                    denom = exp_avg_sq.sqrt().add_(group['eps'])
+                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
+                    p.data.copy_(p_data_fp32)
+                elif self.degenerated_to_sgd:
+                    if group['weight_decay'] != 0:
+                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
+                    step_size = group['lr'] / (1 - beta1 ** state['step'])
+                    p_data_fp32.add_(exp_avg, alpha=-step_size)
+                    p.data.copy_(p_data_fp32)
+
+        return loss
+
+
+class AdamW(Optimizer):
+    """Adam with weight decay applied directly to the parameters and an optional linear lr warm-up."""
+
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay, warmup=warmup)
+        super(AdamW, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(AdamW, self).__setstate__(state)
+
+    def step(self, closure=None):
+        """Apply one update to every parameter that has a gradient; returns ``closure()`` or None."""
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data.float()
+                if grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+                # The update is computed on a float copy and written back with copy_()
+                p_data_fp32 = p.data.float()
+
+                state = self.state[p]
+
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
+                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
+                else:
+                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
+                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # keyword scalar; positional is deprecated
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+
+                denom = exp_avg_sq.sqrt().add_(group['eps'])
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+                # During the first group['warmup'] steps the lr ramps up linearly with the step count
+                if group['warmup'] > state['step']:
+                    scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup']
+                else:
+                    scheduled_lr = group['lr']
+
+                step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1
+
+                if group['weight_decay'] != 0:
+                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * scheduled_lr)
+
+                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
+
+                p.data.copy_(p_data_fp32)
+
+        return loss
\ No newline at end of file
diff --git a/model/optim/warmup_scheduler.py b/model/optim/warmup_scheduler.py
new file mode 100644
index 0000000..1a8d7eb
--- /dev/null
+++ b/model/optim/warmup_scheduler.py
@@ -0,0 +1,86 @@
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+
+
+class GradualWarmupScheduler(_LRScheduler):
+    """Gradually warm up (increase) the learning rate of the wrapped optimizer.
+
+    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        multiplier: target learning rate = base lr * multiplier if multiplier > 1.0.
+            If multiplier = 1.0, lr starts from 0 and ends up with the base_lr.
+        total_epoch: target learning rate is reached at total_epoch, gradually.
+        after_scheduler: after total_epoch, use this scheduler (e.g. ReduceLROnPlateau).
+
+    Raises:
+        ValueError: if multiplier < 1.0 or total_epoch <= 0.
+    """
+
+    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
+        self.multiplier = multiplier
+        if self.multiplier < 1.:
+            raise ValueError('multiplier should be greater thant or equal to 1.')
+        if total_epoch <= 0:
+            # The warm-up ramp divides by total_epoch.
+            raise ValueError('total_epoch should be greater than 0.')
+        self.total_epoch = total_epoch
+        self.after_scheduler = after_scheduler
+        self.finished = False  # set once after_scheduler has taken over
+        super(GradualWarmupScheduler, self).__init__(optimizer)
+
+    def _warmup_lrs(self):
+        """Return the warm-up lr of each param group for the current last_epoch."""
+        if self.multiplier == 1.0:
+            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
+        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in
+                self.base_lrs]
+
+    def get_lr(self):
+        """Return the lr of each param group for the current last_epoch."""
+        if self.last_epoch > self.total_epoch:
+            if self.after_scheduler:
+                if not self.finished:
+                    # Hand over: the follow-up scheduler continues from the warmed-up lr.
+                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
+                    self.finished = True
+                return self.after_scheduler.get_last_lr()
+            return [base_lr * self.multiplier for base_lr in self.base_lrs]
+        return self._warmup_lrs()
+
+    def step_ReduceLROnPlateau(self, metrics, epoch=None):
+        """Step variant used when after_scheduler is a ReduceLROnPlateau.
+
+        Args:
+            metrics: monitored value forwarded to after_scheduler once warm-up is over.
+            epoch: epoch index; defaults to last_epoch + 1.
+        """
+        if epoch is None:
+            epoch = self.last_epoch + 1
+        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
+        if self.last_epoch <= self.total_epoch:
+            for param_group, lr in zip(self.optimizer.param_groups, self._warmup_lrs()):
+                param_group['lr'] = lr
+        else:
+            # epoch is never None here (defaulted above), so always pass the shifted epoch.
+            self.after_scheduler.step(metrics, epoch - self.total_epoch)
+
+    def step(self, epoch=None, metrics=None):
+        """Advance the schedule by one epoch.
+
+        Args:
+            epoch: optional explicit epoch index.
+            metrics: monitored value; only used when after_scheduler is a ReduceLROnPlateau.
+        """
+        if isinstance(self.after_scheduler, ReduceLROnPlateau):
+            self.step_ReduceLROnPlateau(metrics, epoch)
+            return None
+        if self.finished and self.after_scheduler:
+            if epoch is None:
+                self.after_scheduler.step(None)
+            else:
+                self.after_scheduler.step(epoch - self.total_epoch)
+            self._last_lr = self.after_scheduler.get_last_lr()
+            return None
+        return super(GradualWarmupScheduler, self).step(epoch)
diff --git a/model/tools/Balanced_DataParallel.py b/model/tools/Balanced_DataParallel.py
new file mode 100644
index 0000000..bcedc98
--- /dev/null
+++ b/model/tools/Balanced_DataParallel.py
@@ -0,0 +1,136 @@
+import logging
+
+import torch
+
+from torch.nn.parallel import DataParallel
+from torch.nn.parallel._functions import Scatter
+from torch.nn.parallel.parallel_apply import parallel_apply
+
+logger = logging.getLogger(__name__)
+
+
+def scatter(inputs, target_gpus, chunk_sizes, dim=0):
+    r"""
+    Slices tensors into chunks of the given sizes and
+    distributes them across given GPUs. Duplicates
+    references to objects that are not tensors.
+
+    Args:
+        inputs: tensor, or nested tuple/list/dict of tensors and other objects.
+        target_gpus: GPUs to distribute onto.
+        chunk_sizes: chunk sizes handed to ``Scatter`` together with ``target_gpus``.
+        dim: dimension along which tensors are sliced.
+
+    Raises:
+        RuntimeError: if scattering a tensor fails; the original error is chained.
+    """
+
+    def scatter_map(obj):
+        if isinstance(obj, torch.Tensor):
+            try:
+                return Scatter.apply(target_gpus, chunk_sizes, dim, obj)
+            except Exception as err:
+                # Surface the failure to the caller instead of killing the process.
+                raise RuntimeError(
+                    'failed to scatter tensor of size {} along dim {} with chunk_sizes {}'.format(
+                        tuple(obj.size()), dim, chunk_sizes)) from err
+        if isinstance(obj, tuple) and len(obj) > 0:
+            return list(zip(*map(scatter_map, obj)))
+        if isinstance(obj, list) and len(obj) > 0:
+            return list(map(list, zip(*map(scatter_map, obj))))
+        if isinstance(obj, dict) and len(obj) > 0:
+            return list(map(type(obj), zip(*map(scatter_map, obj.items()))))
+        return [obj for _ in target_gpus]
+
+    # After scatter_map is called, a scatter_map cell will exist. This cell
+    # has a reference to the actual function scatter_map, which has references
+    # to a closure that has a reference to the scatter_map cell (because the
+    # fn is recursive). To avoid this reference cycle, we set the function to
+    # None, clearing the cell
+    try:
+        return scatter_map(inputs)
+    finally:
+        scatter_map = None
+
+
+def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0):
+    r"""Scatter with support for kwargs dictionary.
+
+    Returns:
+        (inputs, kwargs): two tuples padded to the same length with empty
+        tuples / dicts.
+    """
+    inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else []
+    kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else []
+    if len(inputs) < len(kwargs):
+        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+    elif len(kwargs) < len(inputs):
+        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+    inputs = tuple(inputs)
+    kwargs = tuple(kwargs)
+    return inputs, kwargs
+
+
+class BalancedDataParallel(DataParallel):
+    """DataParallel whose first device can be given a smaller (or empty) batch share.
+
+    Args:
+        gpu0_bsz: number of samples placed on the first device; 0 leaves it
+            out of the forward computation.
+        *args, **kwargs: forwarded to ``DataParallel``.
+    """
+
+    def __init__(self, gpu0_bsz, *args, **kwargs):
+        self.gpu0_bsz = gpu0_bsz
+        super().__init__(*args, **kwargs)
+
+    def forward(self, *inputs, **kwargs):
+        if not self.device_ids:
+            return self.module(*inputs, **kwargs)
+        skip_gpu0 = self.gpu0_bsz == 0
+        device_ids = self.device_ids[1:] if skip_gpu0 else self.device_ids
+        inputs, kwargs = self.scatter(inputs, kwargs, device_ids)
+
+        if len(self.device_ids) == 1:
+            return self.module(*inputs[0], **kwargs[0])
+        if skip_gpu0:
+            # Replicate on every device, then drop the replica of the first one.
+            replicas = self.replicate(self.module, self.device_ids)[1:]
+        else:
+            replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
+
+        outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs)
+        return self.gather(outputs, self.output_device)
+
+    def parallel_apply(self, replicas, device_ids, inputs, kwargs):
+        return parallel_apply(replicas, inputs, kwargs, device_ids[:len(inputs)])
+
+    def scatter(self, inputs, kwargs, device_ids):
+        """Split the batch so that the first device receives ``gpu0_bsz`` samples.
+
+        Falls back to the default ``DataParallel`` scatter when there is a
+        single device, no positional input, or ``gpu0_bsz`` is not smaller than
+        the per-device share of the remaining devices.
+        """
+        num_dev = len(self.device_ids)
+        if num_dev < 2 or not inputs:
+            # Nothing to balance; also avoids dividing by (num_dev - 1) == 0.
+            return super().scatter(inputs, kwargs, device_ids)
+
+        bsz = inputs[0].size(self.dim)  # assumes the first positional input is the batched tensor
+        gpu0_bsz = self.gpu0_bsz
+        bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1)
+        if gpu0_bsz >= bsz_unit:
+            return super().scatter(inputs, kwargs, device_ids)
+
+        chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1)
+        # Hand the remainder out one sample at a time to the devices after the first.
+        delta = bsz - sum(chunk_sizes)
+        for i in range(delta):
+            chunk_sizes[i + 1] += 1
+        if gpu0_bsz == 0:
+            chunk_sizes = chunk_sizes[1:]
+
+        logger.debug('bsz: %s, num_dev: %s, gpu0_bsz: %s, bsz_unit: %s, chunk_sizes: %s',
+                     bsz, num_dev, gpu0_bsz, bsz_unit, chunk_sizes)
+        return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim)
diff --git a/model/tools/__init__.py b/model/tools/__init__.py
new file mode 100644
index 0000000..af87f13
--- /dev/null
+++ b/model/tools/__init__.py
@@ -0,0 +1,2 @@
+from .Balanced_DataParallel import BalancedDataParallel
+from .split_weights import split_weights
diff --git a/model/tools/__pycache__/Balanced_DataParallel.cpython-36.pyc b/model/tools/__pycache__/Balanced_DataParallel.cpython-36.pyc
new file mode 100644
index 0000000..b89543b
Binary files /dev/null and b/model/tools/__pycache__/Balanced_DataParallel.cpython-36.pyc differ
diff --git a/model/tools/__pycache__/__init__.cpython-36.pyc b/model/tools/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..ee0e836
Binary files /dev/null and b/model/tools/__pycache__/__init__.cpython-36.pyc differ
diff --git a/model/tools/__pycache__/metric.cpython-36.pyc b/model/tools/__pycache__/metric.cpython-36.pyc
new file mode 100644
index 0000000..b94a4f7
Binary files /dev/null and b/model/tools/__pycache__/metric.cpython-36.pyc differ
diff --git a/model/tools/__pycache__/split_weights.cpython-36.pyc b/model/tools/__pycache__/split_weights.cpython-36.pyc
new file mode 100644
index 0000000..618b3dc
Binary files /dev/null and b/model/tools/__pycache__/split_weights.cpython-36.pyc differ
diff --git a/model/tools/metric.py b/model/tools/metric.py
new file mode 100644
index 0000000..05f2238
--- /dev/null
+++ b/model/tools/metric.py
@@ -0,0 +1,89 @@
+import numpy as np
+
+
+class SegmentationMetric:
+    """Accumulates a confusion matrix and derives segmentation metrics from it.
+
+    Rows of ``confusionMatrix`` index the true label, columns the prediction.
+    """
+
+    def __init__(self, num_classes):
+        self.num_classes = num_classes
+        self.confusionMatrix = np.zeros((self.num_classes, self.num_classes))
+
+    def genConfusionMatrix(self, predict, label):
+        """
+        Build the confusion matrix of a single batch.
+        row: True label
+        column: Predict result
+        """
+        # drop pixels whose gt label lies outside [0, num_classes)
+        mask = (label >= 0) & (label < self.num_classes)
+        # Widen to int64 before combining: narrow dtypes such as uint8 would
+        # overflow in num_classes * label + predict and corrupt the counts.
+        true_idx = np.asarray(label[mask], dtype=np.int64)
+        pred_idx = np.asarray(predict[mask], dtype=np.int64)
+        count = np.bincount(self.num_classes * true_idx + pred_idx, minlength=self.num_classes ** 2)
+        return count.reshape(self.num_classes, self.num_classes)
+
+    def _classIoU(self):
+        """Per-class IoU = TP / (TP + FP + FN); nan for classes absent from both gt and prediction."""
+        intersection = np.diag(self.confusionMatrix)
+        # row sums (TP + FN) plus column sums (TP + FP) count TP twice
+        union = self.confusionMatrix.sum(axis=1) + self.confusionMatrix.sum(axis=0) - intersection
+        with np.errstate(divide='ignore', invalid='ignore'):
+            return intersection / union
+
+    def pixelAccuracy(self):
+        """
+        return all class overall pixel accuracy
+        PA = acc = (TP + TN) / (TP + TN + FP + FN)
+        """
+        return np.diag(self.confusionMatrix).sum() / self.confusionMatrix.sum()
+
+    def classPixelAccuracy(self):
+        """
+        return each category pixel accuracy: every diagonal entry divided by its
+        row sum (all pixels whose true label is that class), i.e. per-class recall
+        Acc = TP / (TP + FN)
+        The result holds one value per class, e.g. [0.90, 0.80, 0.96];
+        classes without any gt pixel yield nan.
+        """
+        with np.errstate(divide='ignore', invalid='ignore'):
+            return np.diag(self.confusionMatrix) / self.confusionMatrix.sum(axis=1)
+
+    def meanPixelAccuracy(self):
+        """
+        return a single value: the nan-ignoring mean of the per-class accuracies,
+        e.g. np.nanmean([0.90, 0.80, 0.96, nan, nan]) = (0.90 + 0.80 + 0.96) / 3 = 0.89
+        """
+        return np.nanmean(self.classPixelAccuracy())
+
+    def meanIntersectionOverUnion(self):
+        """
+        Intersection = TP
+        Union = TP + FP + FN
+        IoU = TP / (TP + FP + FN)
+        return (IoU, mIoU): the per-class IoU array and its nan-ignoring mean
+        """
+        IoU = self._classIoU()
+        mIoU = np.nanmean(IoU)
+        return IoU, mIoU
+
+    def FrequencyWeightedIntersectionOverUnion(self):
+        """
+        FWIOU = sum over classes of [(TP+FN)/(TP+FP+TN+FN)] * [TP / (TP + FP + FN)]
+        """
+        freq = self.confusionMatrix.sum(axis=1) / self.confusionMatrix.sum()
+        iu = self._classIoU()
+        present = freq > 0  # classes with no gt pixel contribute nothing
+        return (freq[present] * iu[present]).sum()
+
+    def addBatch(self, predict, label):
+        """Add the confusion matrix of one batch; predict and label must share a shape."""
+        assert predict.shape == label.shape
+        self.confusionMatrix += self.genConfusionMatrix(predict, label)
+
+    def reset(self):
+        """Clear the accumulated confusion matrix."""
+        self.confusionMatrix = np.zeros((self.num_classes, self.num_classes))
diff --git a/model/tools/split_weights.py b/model/tools/split_weights.py
new file mode 100644
index 0000000..dc759a3
--- /dev/null
+++ b/model/tools/split_weights.py
@@ -0,0 +1,35 @@
+import torch.nn as nn
+
+
+def split_weights(net):
+    """split network weights into two categories,
+    one are weights in conv layer and linear layer,
+    others are other learnable parameters (conv bias,
+    bn weights, bn bias, linear bias)
+    Args:
+        net: network architecture
+
+    Returns:
+        a list of two param groups (dicts) for an optimizer: the first holds
+        the conv/linear weights, the second every other parameter with
+        weight_decay set to 0
+    """
+
+    # ids of the weights that should be decayed
+    decay_ids = set()
+    for m in net.modules():
+        if isinstance(m, (nn.Conv2d, nn.Linear)):
+            decay_ids.add(id(m.weight))
+
+    # Walk net.parameters() rather than probing modules for 'weight'/'bias':
+    # this skips attributes that are None (e.g. BatchNorm with affine=False)
+    # and covers parameters stored under any other name, each exactly once.
+    decay = []
+    no_decay = []
+    for p in net.parameters():
+        if id(p) in decay_ids:
+            decay.append(p)
+        else:
+            no_decay.append(p)
+
+    return [dict(params=decay), dict(params=no_decay, weight_decay=0)]
diff --git a/model/unet.py b/model/unet.py
new file mode 100644
index 0000000..7bc7ea1
--- /dev/null
+++ b/model/unet.py
@@ -0,0 +1,26 @@
+import torch.nn as nn
+import segmentation_models_pytorch as smp
+
+
+class Unet(nn.Module):
+    """Thin wrapper around ``smp.Unet`` whose forward returns the logits in a one-element list."""
+
+    def __init__(self, num_classes):
+        super().__init__()
+
+        # Everything except the number of output classes is fixed here.
+        unet_kwargs = dict(
+            encoder_name="se_resnext50_32x4d",
+            encoder_depth=5,
+            encoder_weights='imagenet',
+            decoder_use_batchnorm=True,
+            decoder_channels=[256, 128, 64, 32, 16],
+            decoder_attention_type='scse',
+            in_channels=3,
+            classes=num_classes,
+        )
+        self.model = smp.Unet(**unet_kwargs)
+
+    def forward(self, x):
+        """Run the wrapped model on ``x`` and return ``[logits]``."""
+        return [self.model(x)]