# ---- model/fpn.py ----
import torch
import torch.nn as nn
import torch.nn.functional as F


class FPN(nn.Module):
    """FPN segmentation network with a ResNet-50 encoder (segmentation_models_pytorch).

    Args:
        num_classes: number of output channels of the segmentation head.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Imported lazily: the dependency is only needed once a model is built.
        import segmentation_models_pytorch as smp

        self.model = smp.FPN(
            encoder_name='resnet50',
            encoder_depth=5,
            encoder_weights=None,
            decoder_pyramid_channels=256,
            decoder_segmentation_channels=128,
            decoder_merge_policy='add',
            decoder_dropout=0.,
            in_channels=3,
            classes=num_classes
        )

    def forward(self, x):
        """Return the logits wrapped in a one-element list (the losses read ``logits[0]``)."""
        return [self.model(x)]


# ---- model/losses/__init__.py ----
class LossFunction(nn.Module):
    """Cross-entropy plus 0.2 x multiclass Dice, both computed on ``logits[0]``."""

    def __init__(self):
        super().__init__()
        # Imported lazily: the dependency is only needed once the loss is built.
        from pytorch_toolbelt import losses as L

        self.loss_func1 = nn.CrossEntropyLoss()
        self.loss_func2 = L.DiceLoss(mode='multiclass')

    def forward(self, logits, target):
        """Compute the combined loss.

        Args:
            logits: sequence whose first element is the prediction tensor.
            target: label tensor passed unchanged to both losses.
        """
        prediction = logits[0]
        return self.loss_func1(prediction, target) + 0.2 * self.loss_func2(prediction, target)


class SelfCorrectionLossFunction(nn.Module):
    """Soft-label cross-entropy plus 0.2 x multiclass Dice.

    The soft label is a blend of the one-hot ground truth and the softmax of
    ``soft_predict``; the ground-truth share is ``1 / (cycle_n + 1)``.

    Args:
        cycle: stored on the instance; not read by any code in this class.
    """

    def __init__(self, cycle=12):
        super().__init__()
        self.cycle = cycle

        # Imported lazily: the dependency is only needed once the loss is built.
        from pytorch_toolbelt import losses as L

        self.sc_loss_func1 = PseudoCrossEntropyLoss()
        self.sc_loss_func2 = L.DiceLoss(mode='multiclass')

    def forward(self, predicts, target, soft_predict, cycle_n):
        """Compute the self-correction loss.

        Args:
            predicts: sequence whose first element is the prediction tensor.
            target: integer label tensor; ``to_one_hot`` unpacks it as (b, h, w).
            soft_predict: logits that are soft-maxed over dim 1 and blended
                with the one-hot labels.
            cycle_n: current cycle index; the one-hot weight is 1 / (cycle_n + 1).

        Returns:
            ``soft_ce + 0.2 * dice``.
        """
        # FIX: ``torch.no_grad`` must be *called* to obtain a context manager.
        with torch.no_grad():
            # FIX: ``to_one_hot`` rewrites ignore pixels in place, so hand it a
            # private copy instead of the caller's tensor.  The Dice term below
            # keeps receiving the sanitised labels, exactly as before.
            labels = target.clone()
            soft_predict = F.softmax(soft_predict, dim=1)
            one_hot = self.to_one_hot(labels, soft_predict.size(1))
            soft_predict = self.weighted(one_hot, soft_predict, alpha=1. / (cycle_n + 1))
        loss1 = self.sc_loss_func1(predicts[0], soft_predict)
        # FIX: pass the prediction tensor, not the enclosing list.
        loss2 = self.sc_loss_func2(predicts[0], labels)
        return loss1 + 0.2 * loss2

    @staticmethod
    def weighted(target_one_hot, soft_predict, alpha):
        """Return ``alpha * target_one_hot + (1 - alpha) * soft_predict``."""
        return alpha * target_one_hot + (1 - alpha) * soft_predict

    @staticmethod
    def to_one_hot(tensor, num_cls, dim=1, ignore_index=255):
        """One-hot encode a (b, h, w) label tensor into (b, num_cls, h, w).

        NOTE: entries equal to ``ignore_index`` are overwritten with 0 *in
        place* in ``tensor`` before encoding; pass a clone to keep the
        original labels.
        """
        b, h, w = tensor.shape
        tensor[tensor == ignore_index] = 0
        # FIX: allocate on the labels' device instead of a hard-coded ``.cuda()``.
        onehot_tensor = torch.zeros(b, num_cls, h, w, device=tensor.device)
        onehot_tensor.scatter_(dim, tensor.unsqueeze(dim), 1)
        return onehot_tensor


# ---- model/losses/pseudo_ce_loss.py ----
class PseudoCrossEntropyLoss(nn.Module):
    """Cross-entropy against a soft (probability) target.

    Args:
        dim: class dimension used for ``log_softmax`` and for the sum.
    """

    def __init__(self, dim=1):
        super().__init__()
        self.dim = dim

    def forward(self, input: torch.Tensor, target: torch.Tensor):
        """Return ``mean(sum(-log_softmax(input) * target, dim))``."""
        log_prob = F.log_softmax(input, dim=self.dim)
        per_element = torch.sum(-log_prob * target, dim=self.dim)
        return per_element.mean()
# ---- model/optim/__init__.py ----
import torch.optim as optim


def get_optimizer(params, optimizer_cfg):
    """Build the optimizer described by ``optimizer_cfg``.

    Args:
        params: parameters (or parameter groups) to optimise.
        optimizer_cfg: mapping with keys ``mode``, ``lr``, ``weight_decay``
            and ``lookahead``; ``nesterov`` is also read when mode is 'SGD'.
            Any mode other than 'SGD' or 'RAdam' selects ``optim.Adam``.

    Returns:
        The optimizer, wrapped in ``Lookahead(k=5, alpha=0.5)`` when
        ``optimizer_cfg['lookahead']`` is truthy.
    """
    mode = optimizer_cfg['mode']
    lr = optimizer_cfg['lr']
    weight_decay = optimizer_cfg['weight_decay']

    if mode == 'SGD':
        optimizer = optim.SGD(params, lr=lr, momentum=0.9,
                              weight_decay=weight_decay, nesterov=optimizer_cfg['nesterov'])
    elif mode == 'RAdam':
        from .radam import RAdam  # package-local, loaded only when selected

        optimizer = RAdam(params, lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
    else:
        optimizer = optim.Adam(params, lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)

    if optimizer_cfg['lookahead']:
        from .lookahead import Lookahead  # package-local, loaded only when selected

        optimizer = Lookahead(optimizer, k=5, alpha=0.5)

    # todo: add split_weights.py

    return optimizer


def get_scheduler(optimizer, scheduler_cfg):
    """Build the learning-rate scheduler named by ``scheduler_cfg['mode']``.

    Supported modes: 'OneCycleLR', 'PolyLR', 'CosineAnnealingLR',
    'MultiStepLR', 'CosineAnnealingWarmRestarts', 'CyclicCosAnnealingLR' and
    'GradualWarmupScheduler'.  Each branch reads its own keys from
    ``scheduler_cfg``.

    Raises:
        ValueError: if the mode is not one of the supported names.
    """
    mode = scheduler_cfg['mode']
    lr_sched = optim.lr_scheduler

    if mode == 'OneCycleLR':
        return lr_sched.OneCycleLR(optimizer,
                                   max_lr=optimizer.param_groups[0]['lr'],
                                   total_steps=scheduler_cfg['steps'],
                                   pct_start=scheduler_cfg['pct_start'],
                                   final_div_factor=scheduler_cfg['final_div_factor'],
                                   cycle_momentum=scheduler_cfg['cycle_momentum'],
                                   anneal_strategy=scheduler_cfg['anneal_strategy'])

    if mode == 'PolyLR':
        total_steps = scheduler_cfg['steps']
        power = scheduler_cfg['power']

        def poly_decay(step):
            # Clamp at 0: past ``steps`` the base would turn negative, which
            # yields a complex number for fractional powers.
            return max(0.0, 1 - step / total_steps) ** power

        return lr_sched.LambdaLR(optimizer, lr_lambda=poly_decay)

    if mode == 'CosineAnnealingLR':
        return lr_sched.CosineAnnealingLR(optimizer, T_max=scheduler_cfg['steps'],
                                          eta_min=scheduler_cfg['eta_min'])

    if mode == 'MultiStepLR':
        return lr_sched.MultiStepLR(optimizer,
                                    scheduler_cfg['milestones'],
                                    gamma=scheduler_cfg['gamma'])

    if mode == 'CosineAnnealingWarmRestarts':
        return lr_sched.CosineAnnealingWarmRestarts(optimizer,
                                                    T_0=scheduler_cfg['T_0'],
                                                    T_mult=scheduler_cfg['T_multi'],
                                                    eta_min=scheduler_cfg['eta_min'])

    if mode == 'CyclicCosAnnealingLR':
        from .cyclicLR import CyclicCosAnnealingLR  # package-local, loaded only when selected

        return CyclicCosAnnealingLR(optimizer,
                                    milestones=scheduler_cfg['milestones'],
                                    decay_milestones=scheduler_cfg['decay_milestones'],
                                    eta_min=scheduler_cfg['eta_min'],
                                    gamma=scheduler_cfg['gamma'])

    # FIX: this branch used ``scheduler_cfg.MODE`` (attribute access on a
    # mapping), which raised before the branch or the ValueError was reached.
    if mode == 'GradualWarmupScheduler':
        from .warmup_scheduler import GradualWarmupScheduler  # package-local

        warmup_steps = scheduler_cfg['warmup_steps']
        # The wrapped scheduler starts counting after the warm-up phase.
        shifted = [milestone - warmup_steps for milestone in scheduler_cfg['milestones']]
        after_warmup = lr_sched.MultiStepLR(optimizer,
                                            milestones=shifted,
                                            gamma=scheduler_cfg['gamma'])
        # FIX: ``multiplier`` used to receive the milestones list, which the
        # scheduler cannot compare with 1.0.
        # NOTE(review): the config key 'multiplier' is assumed; 1.0 (warm up
        # from 0 to the base lr) is used when it is absent - confirm.
        return GradualWarmupScheduler(optimizer,
                                      multiplier=scheduler_cfg.get('multiplier', 1.0),
                                      total_epoch=warmup_steps,
                                      after_scheduler=after_warmup)

    raise ValueError('Unsupported scheduler mode: {!r}'.format(mode))
# ---- model/optim/cyclicLR.py ----
import math
from bisect import bisect_right
from collections import defaultdict

import torch
from torch.nn.parallel import DataParallel
from torch.nn.parallel._functions import Scatter
from torch.nn.parallel.parallel_apply import parallel_apply
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer


class _MilestoneRestartLR(_LRScheduler):
    """Shared logic of the cyclic schedulers: anneal inside each milestone
    interval, restart at every milestone, stay at ``eta_min`` after the last.

    Subclasses only provide ``_anneal``.
    """

    def __init__(self, optimizer, milestones, decay_milestones=None, gamma=0.5, eta_min=1e-6, last_epoch=-1):
        milestones = list(milestones)
        if not milestones:
            raise ValueError('Milestones should be a non-empty list of increasing integers.')
        if milestones != sorted(milestones):
            # FIX: the message was never formatted (the list was passed as a
            # second exception argument instead).
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.eta_min = eta_min
        self.milestones = milestones
        self.milestones2 = decay_milestones
        self.gamma = gamma
        super().__init__(optimizer, last_epoch)

    def _anneal(self, amplitude, curr_pos, width):
        """Scale ``amplitude`` for position ``curr_pos`` inside an interval of ``width``."""
        raise NotImplementedError

    def get_lr(self):
        if self.last_epoch >= self.milestones[-1]:
            return [self.eta_min for _ in self.base_lrs]

        idx = bisect_right(self.milestones, self.last_epoch)
        left_barrier = 0 if idx == 0 else self.milestones[idx - 1]
        width = self.milestones[idx] - left_barrier
        curr_pos = self.last_epoch - left_barrier

        if self.milestones2:
            # The peak lr shrinks by ``gamma`` for every decay milestone passed.
            decay = self.gamma ** bisect_right(self.milestones2, self.last_epoch)
            peaks = [base_lr * decay for base_lr in self.base_lrs]
        else:
            peaks = list(self.base_lrs)
        return [self.eta_min + self._anneal(peak - self.eta_min, curr_pos, width) for peak in peaks]


class CyclicCosAnnealingLR(_MilestoneRestartLR):
    r"""Cosine annealing that restarts at every milestone.

    Inside the interval between two consecutive milestones the learning rate is

    .. math::
        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 +
        \cos(\frac{T_{cur}}{T_{max}}\pi))

    where :math:`T_{cur}` is the number of epochs since the previous milestone
    and :math:`T_{max}` the interval width.  Once ``last_epoch`` reaches the
    last milestone the lr stays at ``eta_min``.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        milestones (list of ints): Epoch indices of the restarts. Must be increasing.
        decay_milestones (list of ints): Increasing epoch indices at which the
            peak lr is multiplied by ``gamma``.
        gamma (float): Decay factor of the peak lr. Default: 0.5
        eta_min (float): Minimum learning rate. Default: 1e-6
        last_epoch (int): The index of last epoch. Default: -1.

    Raises:
        ValueError: if ``milestones`` is empty or not sorted.

    See `SGDR: Stochastic Gradient Descent with Warm Restarts`
    (https://arxiv.org/abs/1608.03983).
    """

    def _anneal(self, amplitude, curr_pos, width):
        return amplitude * (1 + math.cos(math.pi * curr_pos / width)) / 2


class CyclicLinearLR(_MilestoneRestartLR):
    r"""Linear decay that restarts at every milestone.

    Inside the interval between two consecutive milestones the learning rate is

    .. math::
        \eta_t = \eta_{min} + (\eta_{max} - \eta_{min})(1 -\frac{T_{cur}}{T_{max}})

    Once ``last_epoch`` reaches the last milestone the lr stays at ``eta_min``.
    The arguments are the same as for :class:`CyclicCosAnnealingLR`.
    """

    def _anneal(self, amplitude, curr_pos, width):
        return amplitude * (1. - 1.0 * curr_pos / width)


# ---- model/optim/lookahead.py ----
class Lookahead(Optimizer):
    '''
    PyTorch implementation of the lookahead wrapper.
    Lookahead Optimizer: https://arxiv.org/abs/1907.08610
    '''

    def __init__(self, optimizer, alpha=0.5, k=6, pullback_momentum="none"):
        '''
        :param optimizer:inner optimizer
        :param k (int): number of lookahead steps
        :param alpha(float): linear interpolation factor. 1.0 recovers the inner optimizer.
        :param pullback_momentum (str): change to inner optimizer momentum on interpolation update
        '''
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        assert pullback_momentum in ["reset", "pullback", "none"]
        # Optimizer.__init__ is deliberately not called: the wrapper shares the
        # inner optimizer's param_groups and keeps its own ``state``.
        self.optimizer = optimizer
        self.param_groups = self.optimizer.param_groups
        self.alpha = alpha
        self.k = k
        self.step_counter = 0
        self.pullback_momentum = pullback_momentum
        self.state = defaultdict(dict)

        # Cache the current optimizer parameters (the "slow" weights).
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['cached_params'] = p.data.clone()
                if self.pullback_momentum == "pullback":
                    # FIX: ``step`` reads this entry, which was never created.
                    param_state['cached_mom'] = torch.zeros_like(p.data)

    def __getstate__(self):
        return {
            'state': self.state,
            'optimizer': self.optimizer,
            'alpha': self.alpha,
            'step_counter': self.step_counter,
            'k': self.k,
            'pullback_momentum': self.pullback_momentum
        }

    def zero_grad(self, *args, **kwargs):
        """Forward to the inner optimizer (extra arguments are passed through)."""
        self.optimizer.zero_grad(*args, **kwargs)

    def state_dict(self):
        """Return the inner optimizer's state dict (slow weights are not included)."""
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        self.optimizer.load_state_dict(state_dict)

    def _backup_and_load_cache(self):
        """Useful for performing evaluation on the slow weights (which typically generalize better)
        """
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['backup_params'] = p.data.clone()
                p.data.copy_(param_state['cached_params'])

    def _clear_and_load_backup(self):
        """Undo ``_backup_and_load_cache``: restore the fast weights and drop the backup."""
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.copy_(param_state['backup_params'])
                del param_state['backup_params']

    def step(self, closure=None):
        """Performs a single Lookahead optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = self.optimizer.step(closure)
        self.step_counter += 1
        if self.step_counter < self.k:
            return loss

        self.step_counter = 0
        # Every k steps: interpolate fast weights towards the cached slow ones.
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.mul_(self.alpha).add_(param_state['cached_params'], alpha=1.0 - self.alpha)
                param_state['cached_params'].copy_(p.data)
                if self.pullback_momentum == "pullback":
                    momentum = self.optimizer.state[p].get("momentum_buffer")
                    if momentum is None:
                        # No buffer yet (e.g. the parameter never received a gradient).
                        continue
                    momentum.mul_(self.alpha).add_(param_state["cached_mom"], alpha=1.0 - self.alpha)
                    # FIX: keep a copy; aliasing the live buffer let every
                    # later inner step overwrite the cached momentum.
                    param_state["cached_mom"] = momentum.clone()
                elif self.pullback_momentum == "reset":
                    self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

        return loss


# ---- model/optim/radam.py ----
def _validate_adam_hparams(lr, eps, betas):
    """Raise ValueError for out-of-range Adam-style hyper-parameters."""
    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 <= eps:
        raise ValueError("Invalid epsilon value: {}".format(eps))
    if not 0.0 <= betas[0] < 1.0:
        raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
    if not 0.0 <= betas[1] < 1.0:
        raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))


def _moment_buffers(state, p_data_fp32):
    """Create (first call) or re-cast the moment buffers in ``state`` and return them."""
    if len(state) == 0:
        state['step'] = 0
        state['exp_avg'] = torch.zeros_like(p_data_fp32)
        state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
    else:
        state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
        state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
    return state['exp_avg'], state['exp_avg_sq']


def _rectification(beta2_t, n_sma, n_sma_max):
    """Variance-rectification factor applied by the RAdam variants."""
    return math.sqrt(
        (1 - beta2_t) * (n_sma - 4) / (n_sma_max - 4) * (n_sma - 2) / n_sma * n_sma_max / (
            n_sma_max - 2))


class RAdam(Optimizer):
    """RAdam that caches the per-step rectification terms in ``group['buffer']``.

    Args:
        params: parameters or parameter groups.
        lr, betas, eps, weight_decay: Adam-style hyper-parameters.
        degenerated_to_sgd: when the rectification is not applicable
            (``N_sma < 5``), take a bias-corrected momentum step instead of
            skipping the update.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
        _validate_adam_hparams(lr, eps, betas)

        self.degenerated_to_sgd = degenerated_to_sgd
        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                # Groups with their own betas get their own cache.
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        buffer=[[None, None, None] for _ in range(10)])
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Perform one optimisation step and return the closure's loss, if any."""
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic runs in fp32 and is copied back at the end.
                p_data_fp32 = p.data.float()
                state = self.state[p]
                exp_avg, exp_avg_sq = _moment_buffers(state, p_data_fp32)

                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                step = state['step']
                buffered = group['buffer'][int(step % 10)]
                if step == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = step
                    beta2_t = beta2 ** step
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = _rectification(beta2_t, N_sma, N_sma_max) / (1 - beta1 ** step)
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** step)
                    else:
                        step_size = -1
                    buffered[2] = step_size

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    if weight_decay != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-weight_decay * lr)
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * lr)
                    p.data.copy_(p_data_fp32)
                elif step_size > 0:
                    if weight_decay != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-weight_decay * lr)
                    p_data_fp32.add_(exp_avg, alpha=-step_size * lr)
                    p.data.copy_(p_data_fp32)

        return loss


class PlainRAdam(Optimizer):
    """RAdam without the rectification cache; arguments as for :class:`RAdam`."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
        _validate_adam_hparams(lr, eps, betas)

        self.degenerated_to_sgd = degenerated_to_sgd
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)

        super().__init__(params, defaults)

    def step(self, closure=None):
        """Perform one optimisation step and return the closure's loss, if any."""
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                p_data_fp32 = p.data.float()
                state = self.state[p]
                exp_avg, exp_avg_sq = _moment_buffers(state, p_data_fp32)

                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                step = state['step']
                beta2_t = beta2 ** step
                N_sma_max = 2 / (1 - beta2) - 1
                N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    if weight_decay != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-weight_decay * lr)
                    step_size = lr * _rectification(beta2_t, N_sma, N_sma_max) / (1 - beta1 ** step)
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                    p.data.copy_(p_data_fp32)
                elif self.degenerated_to_sgd:
                    if weight_decay != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-weight_decay * lr)
                    step_size = lr / (1 - beta1 ** step)
                    p_data_fp32.add_(exp_avg, alpha=-step_size)
                    p.data.copy_(p_data_fp32)

        return loss


class AdamW(Optimizer):
    """Adam with decoupled weight decay and an optional linear lr warm-up.

    Args:
        warmup: number of steps during which the lr is
            ``1e-8 + step * lr / warmup``; 0 disables the warm-up.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0):
        _validate_adam_hparams(lr, eps, betas)

        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, warmup=warmup)
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Perform one optimisation step and return the closure's loss, if any."""
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                p_data_fp32 = p.data.float()
                state = self.state[p]
                exp_avg, exp_avg_sq = _moment_buffers(state, p_data_fp32)

                state['step'] += 1
                step = state['step']

                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** step
                bias_correction2 = 1 - beta2 ** step

                if group['warmup'] > step:
                    scheduled_lr = 1e-8 + step * group['lr'] / group['warmup']
                else:
                    scheduled_lr = group['lr']

                step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * scheduled_lr)

                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)

                p.data.copy_(p_data_fp32)

        return loss


# ---- model/optim/warmup_scheduler.py ----
class GradualWarmupScheduler(_LRScheduler):
    """ Gradually warm-up(increasing) learning rate in optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier if multiplier > 1.0. if multiplier = 1.0, lr starts from 0 and ends up with the base_lr.
        total_epoch: target learning rate is reached at total_epoch, gradually
        after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier < 1.:
            raise ValueError('multiplier should be greater thant or equal to 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer)

    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            target_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
            if not self.after_scheduler:
                return target_lrs
            if not self.finished:
                # Hand over once: the follow-up scheduler starts from the warmed-up lr.
                self.after_scheduler.base_lrs = target_lrs
                self.finished = True
            return self.after_scheduler.get_last_lr()

        if self.multiplier == 1.0:
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.)
                for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        """Step variant used when ``after_scheduler`` is a ReduceLROnPlateau."""
        epoch_given = epoch is not None
        if not epoch_given:
            epoch = self.last_epoch + 1
        # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
        self.last_epoch = epoch if epoch != 0 else 1
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.)
                         for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        elif epoch_given:
            self.after_scheduler.step(metrics, epoch - self.total_epoch)
        else:
            # FIX: this branch was unreachable because ``epoch`` had already
            # been replaced above; let the wrapped scheduler count on its own.
            self.after_scheduler.step(metrics, None)

    def step(self, epoch=None, metrics=None):
        if isinstance(self.after_scheduler, ReduceLROnPlateau):
            self.step_ReduceLROnPlateau(metrics, epoch)
            return None
        if self.finished and self.after_scheduler:
            if epoch is None:
                self.after_scheduler.step(None)
            else:
                self.after_scheduler.step(epoch - self.total_epoch)
            self._last_lr = self.after_scheduler.get_last_lr()
            return None
        return super().step(epoch)


# ---- model/tools/Balanced_DataParallel.py ----
def scatter(inputs, target_gpus, chunk_sizes, dim=0):
    r"""
    Slices tensors into chunks of the given ``chunk_sizes`` and
    distributes them across given GPUs. Duplicates
    references to objects that are not tensors.

    Raises:
        RuntimeError: if a tensor cannot be scattered; the message reports
            its size, ``dim`` and ``chunk_sizes``.
    """

    def scatter_map(obj):
        if isinstance(obj, torch.Tensor):
            try:
                return Scatter.apply(target_gpus, chunk_sizes, dim, obj)
            except Exception as err:
                # FIX: previously a bare ``except`` printed and called quit().
                raise RuntimeError(
                    'failed to scatter tensor of size {} along dim {} with chunk_sizes {}'.format(
                        tuple(obj.size()), dim, chunk_sizes)) from err
        if isinstance(obj, tuple) and len(obj) > 0:
            return list(zip(*map(scatter_map, obj)))
        if isinstance(obj, list) and len(obj) > 0:
            return list(map(list, zip(*map(scatter_map, obj))))
        if isinstance(obj, dict) and len(obj) > 0:
            return list(map(type(obj), zip(*map(scatter_map, obj.items()))))
        return [obj for _ in target_gpus]

    # After scatter_map is called, a scatter_map cell will exist. This cell
    # has a reference to the actual function scatter_map, which has references
    # to a closure that has a reference to the scatter_map cell (because the
    # fn is recursive). To avoid this reference cycle, we set the function to
    # None, clearing the cell
    try:
        return scatter_map(inputs)
    finally:
        scatter_map = None


def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0):
    r"""Scatter with support for kwargs dictionary"""
    inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else []
    kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else []
    # Pad the shorter list so that every replica gets both args and kwargs.
    missing = len(kwargs) - len(inputs)
    if missing > 0:
        inputs.extend([() for _ in range(missing)])
    elif missing < 0:
        kwargs.extend([{} for _ in range(-missing)])
    return tuple(inputs), tuple(kwargs)


class BalancedDataParallel(DataParallel):
    """DataParallel that gives the first device its own batch share.

    Args:
        gpu0_bsz: number of samples placed on ``device_ids[0]``; 0 keeps the
            first device free of forward work.
        *args, **kwargs: forwarded to ``DataParallel``.
    """

    def __init__(self, gpu0_bsz, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gpu0_bsz = gpu0_bsz

    def forward(self, *inputs, **kwargs):
        if not self.device_ids:
            return self.module(*inputs, **kwargs)
        if len(self.device_ids) == 1:
            # FIX: with one device there is nothing to balance; the balanced
            # scatter used to divide by ``num_dev - 1 == 0`` here.
            inputs, kwargs = super().scatter(inputs, kwargs, self.device_ids)
            if not inputs and not kwargs:
                inputs, kwargs = ((),), ({},)
            return self.module(*inputs[0], **kwargs[0])

        skip_gpu0 = self.gpu0_bsz == 0
        device_ids = self.device_ids[1:] if skip_gpu0 else self.device_ids
        inputs, kwargs = self.scatter(inputs, kwargs, device_ids)

        if skip_gpu0:
            replicas = self.replicate(self.module, self.device_ids)[1:]
        else:
            replicas = self.replicate(self.module, self.device_ids[:len(inputs)])

        outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs)
        return self.gather(outputs, self.output_device)

    def parallel_apply(self, replicas, device_ids, inputs, kwargs):
        return parallel_apply(replicas, inputs, kwargs, device_ids[:len(inputs)])

    def scatter(self, inputs, kwargs, device_ids):
        """Split the batch so that the first device receives ``gpu0_bsz`` samples.

        Falls back to the even ``DataParallel`` split when there is a single
        device, no positional input, or ``gpu0_bsz`` is not smaller than the
        share of the other devices.
        """
        num_dev = len(self.device_ids)
        if num_dev < 2 or not inputs:
            return super().scatter(inputs, kwargs, device_ids)

        bsz = inputs[0].size(self.dim)
        gpu0_bsz = self.gpu0_bsz
        bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1)
        if gpu0_bsz >= bsz_unit:
            return super().scatter(inputs, kwargs, device_ids)

        chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1)
        # Hand the remainder out one sample at a time, starting at the second device.
        for i in range(bsz - sum(chunk_sizes)):
            chunk_sizes[i + 1] += 1
        if gpu0_bsz == 0:
            chunk_sizes = chunk_sizes[1:]

        return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim)


# ---- model/tools/__init__.py ----
# This package file only re-exports ``BalancedDataParallel`` (from
# .Balanced_DataParallel) and ``split_weights`` (from .split_weights).
class SegmentationMetric:
    """Accumulate a confusion matrix over batches and derive segmentation metrics.

    The matrix is ``num_classes x num_classes``; rows index the true label
    and columns index the predicted class.
    """

    def __init__(self, num_classes):
        self.num_classes = num_classes
        self.confusionMatrix = np.zeros((self.num_classes, self.num_classes))

    def genConfusionMatrix(self, predict, label):
        """Build the confusion matrix for one batch.

        Pixels whose label lies outside ``[0, num_classes)`` (e.g. an ignore
        index) are dropped from both ``label`` and ``predict``.

        Returns:
            Integer array of shape ``(num_classes, num_classes)``
            (row: true label, column: predicted class).
        """
        predict = np.asarray(predict)
        label = np.asarray(label)
        mask = (label >= 0) & (label < self.num_classes)
        # Widen before multiplying: with uint8 masks the product
        # ``num_classes * label`` would otherwise be computed in uint8 and
        # wrap around, silently scrambling the matrix.
        true_idx = label[mask].astype(np.int64)
        pred_idx = predict[mask].astype(np.int64)
        flat_idx = self.num_classes * true_idx + pred_idx
        count = np.bincount(flat_idx, minlength=self.num_classes ** 2)
        return count.reshape(self.num_classes, self.num_classes)

    def pixelAccuracy(self):
        """Return the overall pixel accuracy: correct pixels / all pixels.

        The result is NaN when no pixels have been accumulated.
        """
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.diag(self.confusionMatrix).sum() / self.confusionMatrix.sum()

    def classPixelAccuracy(self):
        """Return the per-class pixel accuracy as an array.

        Each entry is ``TP / (TP + FN)`` for that class: the diagonal divided
        by the row sum, and rows are true labels, so this is per-class
        recall. Example: ``[0.90, 0.80, 0.96]`` for three classes. Classes
        with no ground-truth pixels yield NaN.
        """
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.diag(self.confusionMatrix) / self.confusionMatrix.sum(axis=1)

    def meanPixelAccuracy(self):
        """Return the mean of the per-class accuracies, ignoring NaN entries.

        Example: ``np.nanmean([0.90, 0.80, 0.96, nan, nan])``
        ``= (0.90 + 0.80 + 0.96) / 3 = 0.89``.
        """
        return np.nanmean(self.classPixelAccuracy())

    def _iouPerClass(self):
        """Return ``TP / (TP + FP + FN)`` per class (NaN where the union is 0)."""
        intersection = np.diag(self.confusionMatrix)
        # Row sums count TP + FN, column sums count TP + FP; subtract the
        # diagonal once so TP is not counted twice.
        union = (np.sum(self.confusionMatrix, axis=1)
                 + np.sum(self.confusionMatrix, axis=0)
                 - intersection)
        with np.errstate(divide='ignore', invalid='ignore'):
            return intersection / union

    def meanIntersectionOverUnion(self):
        """Return ``(IoU, mIoU)``.

        ``IoU`` is the per-class array ``TP / (TP + FP + FN)``; ``mIoU`` is
        its mean, ignoring NaN entries.
        """
        IoU = self._iouPerClass()
        mIoU = np.nanmean(IoU)
        return IoU, mIoU

    def FrequencyWeightedIntersectionOverUnion(self):
        """Return the frequency-weighted IoU.

        ``FWIoU = sum_c freq_c * IoU_c`` with
        ``freq_c = (TP_c + FN_c) / total pixels``; classes with zero
        frequency are left out of the sum.
        """
        with np.errstate(divide='ignore', invalid='ignore'):
            freq = np.sum(self.confusionMatrix, axis=1) / np.sum(self.confusionMatrix)
        iu = self._iouPerClass()
        present = freq > 0
        return (freq[present] * iu[present]).sum()

    def addBatch(self, predict, label):
        """Add one batch of predictions and labels to the running matrix."""
        predict = np.asarray(predict)
        label = np.asarray(label)
        assert predict.shape == label.shape
        self.confusionMatrix += self.genConfusionMatrix(predict, label)

    def reset(self):
        """Clear all accumulated counts."""
        self.confusionMatrix = np.zeros((self.num_classes, self.num_classes))
super(Unet, self).__init__() + + self.model = smp.Unet( + encoder_name="se_resnext50_32x4d", + encoder_depth=5, + encoder_weights='imagenet', + decoder_use_batchnorm=True, + decoder_channels=[256, 128, 64, 32, 16], + decoder_attention_type='scse', + in_channels=3, + classes=num_classes, + ) + + def forward(self, x): + logits = self.model(x) + return [logits]