add STDC

zh320 · zh320 · commit 48430f008e35 · 2024-01-22T22:24:00.000+08:00
Former-commit-id: 2d64fd1765c65934f53510384d954ced9e917b99 [formerly 0428310] Former-commit-id: ee8150c29f36fc54cf4c5bc53998abd5be51f8aa
diff --git a/README.md b/README.md
diff --git a/configs/base_config.py b/configs/base_config.py
@@ -12,6 +12,13 @@ def __init__(self,):
         self.decoder = None
         self.encoder_weights = 'imagenet'
         
+        # Detail Head (For STDC)
+        self.use_detail_head = False
+        self.detail_thrs = 0.1
+        self.detail_loss_coef = 1.0
+        self.dice_loss_coef = 1.0
+        self.bce_loss_coef = 1.0
+        
         # Training
         self.total_epoch = 200
         self.base_lr = 0.01
diff --git a/configs/parser.py b/configs/parser.py
@@ -32,7 +32,8 @@ def get_parser():
                  'enet', 'erfnet', 'esnet', 'espnet', 'espnetv2', 'farseenet',
                  'fastscnn', 'fddwnet', 'fpenet', 'fssnet', 'icnet', 'lednet',
                  'linknet', 'liteseg', 'mininet', 'mininetv2', 'ppliteseg', 
-                 'regseg', 'segnet', 'shelfnet', 'sqnet', 'swiftnet', 'smp'],
+                 'regseg', 'segnet', 'shelfnet', 'sqnet', 'stdc', 'swiftnet',
+                 'smp'],
         help='choose which model you want to use')
     parser.add_argument('--encoder', type=str, default=None, 
         help='choose which encoder of SMP model you want to use (please refer to SMP repo)')
diff --git a/core/__init__.py b/core/__init__.py
@@ -1,3 +1,3 @@
 from .base_trainer import BaseTrainer
 from .seg_trainer import SegTrainer
-from .loss import get_loss_fn, kd_loss_fn
+from .loss import get_loss_fn, kd_loss_fn, get_detail_loss_fn
diff --git a/core/loss.py b/core/loss.py
@@ -16,9 +16,42 @@ def forward(self, logits, labels):
         loss_hard = loss[loss > self.thresh]
         if loss_hard.numel() < n_min:
             loss_hard, _ = loss.topk(n_min)
+
         return torch.mean(loss_hard)
 
 
+class DiceLoss(nn.Module):
+    '''Soft Dice loss, 1 - (2*sum(p*t) + smooth) / (sum(p) + sum(t) + smooth), averaged over dim 0.'''
+    def __init__(self, smooth=1):
+        super(DiceLoss, self).__init__()
+        self.smooth = smooth
+
+    def forward(self, logits, labels):
+        # Keep dim 0, flatten the rest; `logits` is used as-is (no sigmoid is applied here).
+        pred, target = torch.flatten(logits, 1), torch.flatten(labels, 1)
+        numerator = 2 * torch.sum(pred * target, dim=1) + self.smooth
+        denominator = pred.sum(1) + target.sum(1) + self.smooth
+
+        return torch.mean(1 - numerator / denominator)
+
+
+class DetailLoss(nn.Module):
+    '''Detail loss from `Rethinking BiSeNet For Real-time Semantic Segmentation`:
+       dice_loss_coef * Dice(sigmoid(logits), labels) + bce_loss_coef * BCE(logits, labels)'''
+    def __init__(self, dice_loss_coef=1., bce_loss_coef=1., smooth=1):
+        super(DetailLoss, self).__init__()
+        self.dice_loss_coef = dice_loss_coef
+        self.bce_loss_coef = bce_loss_coef
+        self.dice_loss_fn = DiceLoss(smooth)
+        self.bce_loss_fn = nn.BCEWithLogitsLoss()
+
+    def forward(self, logits, labels):
+        # BCEWithLogitsLoss applies the sigmoid itself, while DiceLoss uses its input as-is,
+        # so squash the raw logits to (0, 1) before the overlap ratio is computed.
+        dice = self.dice_loss_fn(torch.sigmoid(logits), labels)
+        return self.dice_loss_coef * dice + self.bce_loss_coef * self.bce_loss_fn(logits, labels)
+
+
 def get_loss_fn(config, device):
     if config.class_weights is None:
         weights = None
@@ -28,16 +61,22 @@ def get_loss_fn(config, device):
     if config.loss_type == 'ce':
         criterion = nn.CrossEntropyLoss(ignore_index=config.ignore_index, 
                                         reduction=config.reduction, weight=weights)
-    
+
     elif config.loss_type == 'ohem':
         criterion = OhemCELoss(thresh=config.ohem_thrs, ignore_index=config.ignore_index)  
 
     else:
         raise NotImplementedError(f"Unsupport loss type: {config.loss_type}")
-        
+
     return criterion
-    
-    
+
+
+def get_detail_loss_fn(config):
+    '''Build the DetailLoss criterion from config.dice_loss_coef and config.bce_loss_coef.'''
+    coefs = {'dice_loss_coef': config.dice_loss_coef, 'bce_loss_coef': config.bce_loss_coef}
+    return DetailLoss(**coefs)
+
+
 def kd_loss_fn(config, outputs, outputsT):
     if config.kd_loss_type == 'kl_div':
         lossT = F.kl_div(F.log_softmax(outputs/config.kd_temperature, dim=1),
diff --git a/core/seg_trainer.py b/core/seg_trainer.py
@@ -21,6 +21,13 @@ def __init__(self, config):
             self.teacher_model = get_teacher_model(config, self.device)
             self.metrics = get_seg_metrics(config).to(self.device)
 
+            if config.use_detail_head:
+                from .loss import get_detail_loss_fn
+                from models import LaplacianConv
+
+                self.laplacian_conv = LaplacianConv(self.device)
+                self.detail_loss_fn = get_detail_loss_fn(config)
+
     def train_one_epoch(self, config):
         self.model.train()
         
@@ -57,14 +64,33 @@ def train_one_epoch(self, config):
                     with amp.autocast(enabled=config.amp_training):
                         loss += config.aux_coef[i] * self.loss_fn(preds_aux[i], masks_aux)
 
+            # Detail loss proposed in paper for model STDC
+            elif config.use_detail_head:
+                masks_detail = masks.unsqueeze(1).float()
+                masks_detail = self.laplacian_conv(masks_detail)
+
+                with amp.autocast(enabled=config.amp_training):
+                    # Detail ground truth
+                    masks_detail = self.model.module.detail_conv(masks_detail)
+                    masks_detail[masks_detail > config.detail_thrs] = 1
+                    masks_detail[masks_detail <= config.detail_thrs] = 0
+                    detail_size = masks_detail.size()[2:]
+
+                    preds, preds_detail = self.model(images, is_training=True)
+                    preds_detail = F.interpolate(preds_detail, detail_size, mode='bilinear', align_corners=True)
+                    loss_detail = self.detail_loss_fn(preds_detail, masks_detail)
+                    loss = self.loss_fn(preds, masks) + config.detail_loss_coef * loss_detail
+
             else:
                 with amp.autocast(enabled=config.amp_training):
                     preds = self.model(images)
                     loss = self.loss_fn(preds, masks)
 
             if config.use_tb and self.main_rank:
                 self.writer.add_scalar('train/loss', loss.detach(), self.train_itrs)
-            
+                if config.use_detail_head:
+                    self.writer.add_scalar('train/loss_detail', loss_detail.detach(), self.train_itrs)
+
             # Knowledge distillation
             if config.kd_training:
                 with amp.autocast(enabled=config.amp_training):
@@ -75,8 +101,8 @@ def train_one_epoch(self, config):
                     loss += config.kd_loss_coefficient * loss_kd
 
                 if config.use_tb and self.main_rank:
-                    self.writer.add_scalar('train/loss_kd', loss_kd.detach(), self.train_itrs)  
-                    self.writer.add_scalar('train/loss_total', loss.detach(), self.train_itrs)      
+                    self.writer.add_scalar('train/loss_kd', loss_kd.detach(), self.train_itrs)
+                    self.writer.add_scalar('train/loss_total', loss.detach(), self.train_itrs)
                    
             # Backward path
             self.scaler.scale(loss).backward()
diff --git a/models/__init__.py b/models/__init__.py
@@ -34,6 +34,7 @@
 from .segnet import SegNet
 from .shelfnet import ShelfNet
 from .sqnet import SQNet
+from .stdc import STDC, LaplacianConv
 from .swiftnet import SwiftNet
 
 
@@ -54,10 +55,13 @@ def get_model(config):
                 'linknet':LinkNet, 'liteseg':LiteSeg, 'mininet':MiniNet, 
                 'mininetv2':MiniNetv2, 'ppliteseg':PPLiteSeg, 'regseg':RegSeg,
                 'segnet':SegNet, 'shelfnet':ShelfNet, 'sqnet':SQNet, 
-                'swiftnet':SwiftNet,}
+                'stdc':STDC, 'swiftnet':SwiftNet,}
 
     # The following models currently support auxiliary heads
     aux_models = ['bisenetv2', 'ddrnet', 'icnet']
+
+    # The following models currently support detail heads
+    detail_head_models = ['stdc']
     
     if config.model == 'smp':   # Use segmentation models pytorch
         if config.decoder not in decoder_hub:
@@ -70,7 +74,14 @@ def get_model(config):
     elif config.model in model_hub.keys():
         if config.model in aux_models:
             model = model_hub[config.model](num_class=config.num_class, use_aux=config.use_aux)
+        elif config.model in detail_head_models:
+            model = model_hub[config.model](num_class=config.num_class, use_detail_head=config.use_detail_head, use_aux=config.use_aux)
         else:
+            if config.use_aux:
+                raise ValueError(f'Model {config.model} does not support auxiliary heads.\n')
+            if config.use_detail_head:
+                raise ValueError(f'Model {config.model} does not support detail heads.\n')
+
             model = model_hub[config.model](num_class=config.num_class)
 
     else:
@@ -83,7 +94,7 @@ def get_teacher_model(config, device):
     if config.kd_training:
         if not os.path.isfile(config.teacher_ckpt):
             raise ValueError(f'Could not find teacher checkpoint at path {config.teacher_ckpt}.')
-        
+
         if config.teacher_decoder not in decoder_hub.keys():
             raise ValueError(f"Unsupported teacher decoder type: {config.teacher_decoder}")      
 
@@ -93,10 +104,10 @@ def get_teacher_model(config, device):
         teacher_ckpt = torch.load(config.teacher_ckpt, map_location=torch.device('cpu'))
         model.load_state_dict(teacher_ckpt['state_dict'])
         del teacher_ckpt
-            
+
         model = model.to(device)    
         model.eval()
     else:
         model = None
-        
+
     return model
diff --git a/models/stdc.py b/models/stdc.py
@@ -0,0 +1,142 @@
+"""
+Paper:      Rethinking BiSeNet For Real-time Semantic Segmentation
+Url:        https://arxiv.org/abs/2104.13188
+Created by: zh320
+Date:       2024/01/20
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .modules import conv1x1, ConvBNAct, SegHead
+from .bisenetv1 import AttentionRefinementModule, FeatureFusionModule
+
+
+class STDC(nn.Module):
+    '''STDC segmentation network (encoder_type 'stdc1' or 'stdc2'). forward(x) returns logits
+       resized to the input H x W; with is_training=True it additionally returns the detail-head
+       output (use_detail_head) or the three auxiliary-head outputs (use_aux).'''
+    def __init__(self, num_class=1, n_channel=3, encoder_type='stdc1', use_detail_head=False, use_aux=False, 
+                    act_type='relu'):
+        super(STDC, self).__init__()
+        repeat_times_hub = {'stdc1': [1,1,1], 'stdc2': [3,4,2]}
+        if encoder_type not in repeat_times_hub:
+            raise ValueError('Unsupported encoder type.\n')
+        repeat_times = repeat_times_hub[encoder_type]
+        # Explicit raise (not assert) so the exclusivity check is not stripped under `python -O`
+        if use_detail_head and use_aux:
+            raise ValueError('Currently only support either aux-head or detail head.\n')
+        self.use_detail_head = use_detail_head
+        self.use_aux = use_aux
+
+        self.stage1 = ConvBNAct(n_channel, 32, 3, 2, act_type=act_type)   # NOTE(review): assumes ConvBNAct takes act_type=
+        self.stage2 = ConvBNAct(32, 64, 3, 2, act_type=act_type)
+        self.stage3 = self._make_stage(64, 256, repeat_times[0], act_type)
+        self.stage4 = self._make_stage(256, 512, repeat_times[1], act_type)
+        self.stage5 = self._make_stage(512, 1024, repeat_times[2], act_type)
+
+        if use_aux:
+            self.aux_head3 = SegHead(256, num_class, act_type)
+            self.aux_head4 = SegHead(512, num_class, act_type)
+            self.aux_head5 = SegHead(1024, num_class, act_type)
+
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.arm4 = AttentionRefinementModule(512)
+        self.arm5 = AttentionRefinementModule(1024)
+        self.conv4 = conv1x1(512, 256)
+        self.conv5 = conv1x1(1024, 256)
+
+        self.ffm = FeatureFusionModule(256+256, 128, act_type)
+
+        self.seg_head = SegHead(128, num_class, act_type)
+        if use_detail_head:
+            self.detail_head = SegHead(256, 1, act_type)
+            # Not used in forward(); the trainer applies it to the 3-channel Laplacian map
+            self.detail_conv = conv1x1(3, 1)
+
+    def _make_stage(self, in_channels, out_channels, repeat_times, act_type):
+        '''One stride-2 STDCModule followed by `repeat_times` stride-1 STDCModules.'''
+        layers = [STDCModule(in_channels, out_channels, 2, act_type)]
+        layers += [STDCModule(out_channels, out_channels, 1, act_type) for _ in range(repeat_times)]
+        return nn.Sequential(*layers)
+
+    def forward(self, x, is_training=False):
+        size = x.size()[2:]
+        # Auxiliary outputs are only ever returned in training, so skip those heads otherwise
+        need_aux = self.use_aux and is_training
+
+        x = self.stage1(x)
+        x = self.stage2(x)
+        x3 = self.stage3(x)
+        x4 = self.stage4(x3)
+        x5 = self.stage5(x4)
+        if need_aux:
+            # Taken from the raw stage outputs, before x4/x5 are refined below
+            aux = (self.aux_head3(x3), self.aux_head4(x4), self.aux_head5(x5))
+
+        x5_pool = self.pool(x5)
+        x5 = x5_pool + self.arm5(x5)
+        x5 = self.conv5(x5)
+        x5 = F.interpolate(x5, scale_factor=2, mode='bilinear', align_corners=True)
+
+        x4 = self.arm4(x4)
+        x4 = self.conv4(x4)
+        x4 += x5
+        x4 = F.interpolate(x4, scale_factor=2, mode='bilinear', align_corners=True)
+
+        x = self.ffm(x4, x3)
+        x = self.seg_head(x)
+        x = F.interpolate(x, size, mode='bilinear', align_corners=True)
+
+        if self.use_detail_head and is_training:
+            return x, self.detail_head(x3)
+        if need_aux:
+            return x, aux
+        return x
+
+
+class STDCModule(nn.Module):
+    '''STDC block: four chained ConvBNAct layers (out/2, out/4, out/8, out/8 channels) concatenated on dim 1.'''
+    def __init__(self, in_channels, out_channels, stride, act_type):
+        super(STDCModule, self).__init__()
+        if out_channels % 8 != 0:
+            raise ValueError('Output channel should be evenly divided by 8.\n')
+        if stride not in [1, 2]:
+            raise ValueError(f'Unsupported stride: {stride}\n')
+
+        self.stride = stride
+        # Forward act_type so the requested activation is used (NOTE(review): assumes ConvBNAct takes act_type=)
+        self.block1 = ConvBNAct(in_channels, out_channels//2, 1, act_type=act_type)
+        self.block2 = ConvBNAct(out_channels//2, out_channels//4, 3, stride, act_type=act_type)
+        if self.stride == 2:
+            self.pool = nn.AvgPool2d(3, 2, 1)
+        self.block3 = ConvBNAct(out_channels//4, out_channels//8, 3, act_type=act_type)
+        self.block4 = ConvBNAct(out_channels//8, out_channels//8, 3, act_type=act_type)
+
+    def forward(self, x):
+        x1 = self.block1(x)
+        x2 = self.block2(x1)
+        if self.stride == 2:
+            x1 = self.pool(x1)    # block2 ran with stride 2, so x1 is pooled with stride 2 as well
+        x3 = self.block3(x2)
+        return torch.cat([x1, x2, x3, self.block4(x3)], dim=1)
+
+
+class LaplacianConv(nn.Module):
+    '''Apply a fixed 3x3 Laplacian kernel to a 1-channel map at strides 1, 2 and 4,
+       resize the strided results back (nearest) and stack them into 3 channels.'''
+    def __init__(self, device):
+        super(LaplacianConv, self).__init__()
+        kernel = torch.tensor([[[[-1.,-1.,-1.],[-1.,8.,-1.],[-1.,-1.,-1.]]]], device=device)
+        # Non-persistent buffer: follows module.to(...) but is kept out of state_dict.
+        self.register_buffer('laplacian_kernel', kernel, persistent=False)
+
+    def forward(self, lbl):
+        size = lbl.size()[2:]
+        # With a 3x3 kernel, padding=1 keeps the stride-1 response at the input resolution
+        pyramid = [F.conv2d(lbl, self.laplacian_kernel, stride=s, padding=1) for s in (1, 2, 4)]
+        # Bring the stride-2/4 responses back to `size` before concatenating on dim 1
+        pyramid[1:] = [F.interpolate(p, size, mode='nearest') for p in pyramid[1:]]
+
+        return torch.cat(pyramid, dim=1)
diff --git a/tools/get_model_infos.py b/tools/get_model_infos.py
@@ -2,7 +2,7 @@
 from os import path
 sys.path.append( path.dirname( path.dirname( path.abspath(__file__) ) ) )
 
-from configs import MyConfig
+from configs import MyConfig, load_parser
 from models import get_model
 
 
@@ -29,5 +29,6 @@ def cal_model_params(config, imgw=1024, imgh=512):
 
 if __name__ == '__main__':
     config = MyConfig()
+    config = load_parser(config)
     
     cal_model_params(config)
diff --git a/tools/test_speed.py b/tools/test_speed.py
@@ -2,7 +2,7 @@
 from os import path
 sys.path.append( path.dirname( path.dirname( path.abspath(__file__) ) ) )
 
-from configs import MyConfig
+from configs import MyConfig, load_parser
 from models import get_model
 
 
@@ -63,5 +63,6 @@ def test_model_speed(config, ratio=0.5, imgw=2048, imgh=1024, iterations=None):
 
 if __name__ == '__main__':
     config = MyConfig()
-    
+    config = load_parser(config)
+
     test_model_speed(config)