 # --------------------------------------------------------
 import logging
 import math
-from typing import Any, Dict, Callable, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, Callable, List, Optional, Set, Tuple, Union, Type

 import torch
 import torch.nn as nn
@@ -77,7 +77,7 @@ def window_reverse(windows: torch.Tensor, window_size: Tuple[int, int], H: int,
     return x


-def get_relative_position_index(win_h: int, win_w: int) -> torch.Tensor:
+def get_relative_position_index(win_h: int, win_w: int, device=None) -> torch.Tensor:
     """Get pair-wise relative position index for each token inside the window.

     Args:
@@ -88,7 +88,10 @@ def get_relative_position_index(win_h: int, win_w: int) -> torch.Tensor:
         Relative position index tensor.
     """
     # get pair-wise relative position index for each token inside the window
-    coords = torch.stack(ndgrid(torch.arange(win_h), torch.arange(win_w)))  # 2, Wh, Ww
+    coords = torch.stack(ndgrid(
+        torch.arange(win_h, device=device, dtype=torch.long),
+        torch.arange(win_w, device=device, dtype=torch.long),
+    ))  # 2, Wh, Ww
     coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
     relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
     relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
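For reference, a standalone sketch of what this helper computes, assuming `ndgrid` behaves like `torch.meshgrid(..., indexing='ij')` and including the offset-and-flatten steps that follow in the unmodified remainder of the function. The new `device=` argument only controls where the index buffer is allocated; the values are unchanged:

```python
import torch

def rel_pos_index(win_h: int, win_w: int, device=None) -> torch.Tensor:
    # Pair-wise relative position index for a win_h x win_w window.
    coords = torch.stack(torch.meshgrid(
        torch.arange(win_h, device=device, dtype=torch.long),
        torch.arange(win_w, device=device, dtype=torch.long),
        indexing='ij',
    ))                                          # 2, Wh, Ww
    coords_flatten = torch.flatten(coords, 1)   # 2, Wh*Ww
    rel = coords_flatten[:, :, None] - coords_flatten[:, None, :]
    rel = rel.permute(1, 2, 0).contiguous()     # Wh*Ww, Wh*Ww, 2
    rel[:, :, 0] += win_h - 1                   # shift offsets to start from 0
    rel[:, :, 1] += win_w - 1
    rel[:, :, 0] *= 2 * win_w - 1
    return rel.sum(-1)                          # Wh*Ww, Wh*Ww

print(rel_pos_index(2, 2))
# 4x4 indices into a (2*2-1)*(2*2-1) = 9-row relative position bias table
```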
@@ -114,6 +117,8 @@ def __init__(
             qkv_bias: bool = True,
             attn_drop: float = 0.,
             proj_drop: float = 0.,
+            device=None,
+            dtype=None,
     ):
         """
         Args:
@@ -125,6 +130,7 @@
             attn_drop: Dropout ratio of attention weight.
             proj_drop: Dropout ratio of output.
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
         self.dim = dim
         self.window_size = to_2tuple(window_size)  # Wh, Ww
@@ -137,14 +143,19 @@
         self.fused_attn = use_fused_attn(experimental=True)  # NOTE not tested for prime-time yet

         # define a parameter table of relative position bias, shape: 2*Wh-1 * 2*Ww-1, nH
-        self.relative_position_bias_table = nn.Parameter(torch.zeros((2 * win_h - 1) * (2 * win_w - 1), num_heads))
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * win_h - 1) * (2 * win_w - 1), num_heads, **dd))

         # get pair-wise relative position index for each token inside the window
-        self.register_buffer("relative_position_index", get_relative_position_index(win_h, win_w), persistent=False)
+        self.register_buffer(
+            "relative_position_index",
+            get_relative_position_index(win_h, win_w, device=device),
+            persistent=False,
+        )

-        self.qkv = nn.Linear(dim, attn_dim * 3, bias=qkv_bias)
+        self.qkv = nn.Linear(dim, attn_dim * 3, bias=qkv_bias, **dd)
         self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(attn_dim, dim)
+        self.proj = nn.Linear(attn_dim, dim, **dd)
         self.proj_drop = nn.Dropout(proj_drop)

         trunc_normal_(self.relative_position_bias_table, std=.02)
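The `dd = {'device': device, 'dtype': dtype}` dict is the factory-kwargs idiom PyTorch's own modules use: every parameter- or buffer-creating call receives `**dd`, so weights are created on the requested device and dtype rather than on CPU and moved afterwards. A toy sketch of the pattern (the `TinyBlock` class is hypothetical, not part of this file):

```python
import torch
import torch.nn as nn

class TinyBlock(nn.Module):
    def __init__(self, dim: int, device=None, dtype=None):
        dd = {'device': device, 'dtype': dtype}
        super().__init__()
        # Both the parameterized layer and the index buffer honor device/dtype.
        self.proj = nn.Linear(dim, dim, **dd)
        self.register_buffer('ids', torch.arange(dim, device=device), persistent=False)

# Deferred allocation: nothing is materialized on the meta device.
blk = TinyBlock(64, device='meta', dtype=torch.bfloat16)
print(blk.proj.weight.device, blk.proj.weight.dtype)  # meta torch.bfloat16
```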
@@ -169,7 +180,11 @@ def set_window_size(self, window_size: Tuple[int, int]) -> None:
                 new_window_size=self.window_size,
                 new_bias_shape=new_bias_shape,
             ))
-        self.register_buffer("relative_position_index", get_relative_position_index(win_h, win_w), persistent=False)
+        self.register_buffer(
+            "relative_position_index",
+            get_relative_position_index(win_h, win_w, device=self.relative_position_bias_table.device),
+            persistent=False,
+        )

     def _get_rel_pos_bias(self) -> torch.Tensor:
         relative_position_bias = self.relative_position_bias_table[
@@ -241,8 +256,10 @@ def __init__(
             proj_drop: float = 0.,
             attn_drop: float = 0.,
             drop_path: float = 0.,
-            act_layer: Callable = nn.GELU,
-            norm_layer: Callable = nn.LayerNorm,
+            act_layer: Type[nn.Module] = nn.GELU,
+            norm_layer: Type[nn.Module] = nn.LayerNorm,
+            device=None,
+            dtype=None,
     ):
         """
         Args:
@@ -261,6 +278,7 @@
             act_layer: Activation layer.
             norm_layer: Normalization layer.
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
@@ -271,7 +289,7 @@
         self.window_area = self.window_size[0] * self.window_size[1]
         self.mlp_ratio = mlp_ratio

-        self.norm1 = norm_layer(dim)
+        self.norm1 = norm_layer(dim, **dd)
         self.attn = WindowAttention(
             dim,
             num_heads=num_heads,
@@ -280,25 +298,32 @@
             qkv_bias=qkv_bias,
             attn_drop=attn_drop,
             proj_drop=proj_drop,
+            **dd,
         )
         self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

-        self.norm2 = norm_layer(dim)
+        self.norm2 = norm_layer(dim, **dd)
         self.mlp = Mlp(
             in_features=dim,
             hidden_features=int(dim * mlp_ratio),
             act_layer=act_layer,
             drop=proj_drop,
+            **dd,
         )
         self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

         self.register_buffer(
             "attn_mask",
-            None if self.dynamic_mask else self.get_attn_mask(),
+            None if self.dynamic_mask else self.get_attn_mask(**dd),
             persistent=False,
         )

-    def get_attn_mask(self, x: Optional[torch.Tensor] = None) -> Optional[torch.Tensor]:
+    def get_attn_mask(
+            self,
+            x: Optional[torch.Tensor] = None,
+            device: Optional[torch.device] = None,
+            dtype: Optional[torch.dtype] = None,
+    ) -> Optional[torch.Tensor]:
         if any(self.shift_size):
             # calculate attention mask for SW-MSA
             if x is not None:
@@ -307,8 +332,8 @@ def get_attn_mask(self, x: Optional[torch.Tensor] = None) -> Optional[torch.Tens
                 dtype = x.dtype
             else:
                 H, W = self.input_resolution
-                device = None
-                dtype = None
+                device = device
+                dtype = dtype
             H = math.ceil(H / self.window_size[0]) * self.window_size[0]
             W = math.ceil(W / self.window_size[1]) * self.window_size[1]
             img_mask = torch.zeros((1, H, W, 1), dtype=dtype, device=device)  # 1 H W 1
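With the extended signature, the precomputed SW-MSA mask can be placed directly on the target device and dtype when no reference activation tensor is available, instead of falling back to CPU/float32 defaults. A hedged usage sketch against the patched block (CPU is used here only so the snippet runs anywhere; `'cuda'` or `'meta'` are the intended targets):

```python
import torch
from timm.models.swin_transformer import SwinTransformerBlock

# Assumes the patched constructor from this diff (device/dtype kwargs).
blk = SwinTransformerBlock(
    dim=96,
    input_resolution=(56, 56),
    num_heads=3,
    window_size=7,
    shift_size=3,            # non-zero shift -> a mask is actually built
    device='cpu',
    dtype=torch.bfloat16,
)
mask = blk.get_attn_mask(device=torch.device('cpu'), dtype=torch.bfloat16)
print(mask.shape, mask.dtype)  # (num_windows, 49, 49), torch.bfloat16
```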
@@ -372,9 +397,11 @@ def set_input_size(
         self.window_size, self.shift_size = self._calc_window_shift(window_size)
         self.window_area = self.window_size[0] * self.window_size[1]
         self.attn.set_window_size(self.window_size)
+        device = self.attn_mask.device if self.attn_mask is not None else None
+        dtype = self.attn_mask.dtype if self.attn_mask is not None else None
         self.register_buffer(
             "attn_mask",
-            None if self.dynamic_mask else self.get_attn_mask(),
+            None if self.dynamic_mask else self.get_attn_mask(device=device, dtype=dtype),
             persistent=False,
         )

@@ -444,19 +471,22 @@ def __init__(
             self,
             dim: int,
             out_dim: Optional[int] = None,
-            norm_layer: Callable = nn.LayerNorm,
+            norm_layer: Type[nn.Module] = nn.LayerNorm,
+            device=None,
+            dtype=None,
     ):
         """
         Args:
             dim: Number of input channels.
             out_dim: Number of output channels (or 2 * dim if None)
             norm_layer: Normalization layer.
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
         self.dim = dim
         self.out_dim = out_dim or 2 * dim
-        self.norm = norm_layer(4 * dim)
-        self.reduction = nn.Linear(4 * dim, self.out_dim, bias=False)
+        self.norm = norm_layer(4 * dim, **dd)
+        self.reduction = nn.Linear(4 * dim, self.out_dim, bias=False, **dd)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass.
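As a reminder of why the layers above operate on `4 * dim`: patch merging concatenates each 2x2 neighborhood of patches along the channel axis before the norm and linear reduction. A standalone sketch of that reshape (mirroring the upstream forward, written here for illustration only):

```python
import torch

x = torch.randn(1, 8, 8, 96)                  # B, H, W, C (NHWC feature map)
B, H, W, C = x.shape
x = x.reshape(B, H // 2, 2, W // 2, 2, C)     # split H and W into 2x2 groups
x = x.permute(0, 1, 3, 4, 2, 5).flatten(3)    # B, H/2, W/2, 4*C
print(x.shape)                                # torch.Size([1, 4, 4, 384])
```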
@@ -502,7 +532,9 @@ def __init__(
             proj_drop: float = 0.,
             attn_drop: float = 0.,
             drop_path: Union[List[float], float] = 0.,
-            norm_layer: Callable = nn.LayerNorm,
+            norm_layer: Type[nn.Module] = nn.LayerNorm,
+            device=None,
+            dtype=None,
     ):
         """
         Args:
@@ -521,6 +553,7 @@
             drop_path: Stochastic depth rate.
             norm_layer: Normalization layer.
         """
+        dd = {'device': device, 'dtype': dtype}
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
@@ -536,6 +569,7 @@
                 dim=dim,
                 out_dim=out_dim,
                 norm_layer=norm_layer,
+                **dd,
             )
         else:
             assert dim == out_dim
@@ -558,6 +592,7 @@
                 attn_drop=attn_drop,
                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                 norm_layer=norm_layer,
+                **dd,
             )
             for i in range(depth)])

@@ -631,9 +666,11 @@ def __init__(
             proj_drop_rate: float = 0.,
             attn_drop_rate: float = 0.,
             drop_path_rate: float = 0.1,
-            embed_layer: Callable = PatchEmbed,
-            norm_layer: Union[str, Callable] = nn.LayerNorm,
+            embed_layer: Type[nn.Module] = PatchEmbed,
+            norm_layer: Union[str, Type[nn.Module]] = nn.LayerNorm,
             weight_init: str = '',
+            device=None,
+            dtype=None,
             **kwargs,
     ):
         """
@@ -656,6 +693,7 @@
             norm_layer (nn.Module): Normalization layer.
         """
         super().__init__()
+        dd = {'device': device, 'dtype': dtype}
         assert global_pool in ('', 'avg')
         self.num_classes = num_classes
         self.global_pool = global_pool
@@ -678,6 +716,7 @@
             norm_layer=norm_layer,
             strict_img_size=strict_img_size,
             output_fmt='NHWC',
+            **dd,
         )
         patch_grid = self.patch_embed.grid_size

@@ -715,20 +754,22 @@ def __init__(
                 attn_drop=attn_drop_rate,
                 drop_path=dpr[i],
                 norm_layer=norm_layer,
+                **dd,
             )]
             in_dim = out_dim
             if i > 0:
                 scale *= 2
             self.feature_info += [dict(num_chs=out_dim, reduction=patch_size * scale, module=f'layers.{i}')]
         self.layers = nn.Sequential(*layers)

-        self.norm = norm_layer(self.num_features)
+        self.norm = norm_layer(self.num_features, **dd)
         self.head = ClassifierHead(
             self.num_features,
             num_classes,
             pool_type=global_pool,
             drop_rate=drop_rate,
             input_fmt=self.output_fmt,
+            **dd,
         )
         if weight_init != 'skip':
             self.init_weights(weight_init)
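Taken together, the constructor changes let the full model be instantiated directly on a target device and dtype. A minimal sketch, assuming the patched `SwinTransformer` from this diff; the hyper-parameters shown are the standard Swin-T values, used here only for illustration:

```python
import torch
from timm.models.swin_transformer import SwinTransformer

# Build on the meta device: parameters and buffers get their shapes and dtypes
# but no storage, which helps with sharded or low-memory checkpoint loading.
model = SwinTransformer(
    img_size=224,
    patch_size=4,
    embed_dim=96,
    depths=(2, 2, 6, 2),
    num_heads=(3, 6, 12, 24),
    device='meta',
    dtype=torch.float16,
)

# Materialize (uninitialized) weights on a real device before loading a state dict.
model = model.to_empty(device='cpu')
```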